<a href="https://colab.research.google.com/github/ritzdevp/Tweet-Author-Classification/blob/main/models/EDL_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

In [2]:
!git clone https://github.com/ritzdevp/Tweet-Author-Classification.git

Cloning into 'Tweet-Author-Classification'...
remote: Enumerating objects: 97, done.[K
remote: Counting objects: 100% (97/97), done.[K
remote: Compressing objects: 100% (68/68), done.[K
remote: Total 97 (delta 41), reused 72 (delta 25), pack-reused 0[K
Unpacking objects: 100% (97/97), done.


In [3]:
data = pd.read_csv('/content/Tweet-Author-Classification/cleaned_tweets.csv', index_col=[0])

In [4]:
len(data)

38230

In [5]:
authors2remove = ['elonmusk', 'tyler', 'joebiden', 'openai', 'jeffbezos', 'dhh','sundarpichai', 'ivankatrump']

In [6]:
data.columns = ['tweet', 'author']

In [7]:
data.head()

Unnamed: 0,tweet,author
0,🤣,elonmusk
1,♥ ️ 🚀 ✨,elonmusk
2,spacex falcon team making great progress aimin...,elonmusk
3,thank sir,elonmusk
4,🤣 🤣,elonmusk


In [8]:
data = data[data["author"].str.contains("elonmusk|tyler|joebiden|openai|jeffbezos|dhh|sundarpichai|ivankatrump|narendramodi") == False]

In [9]:
data = data.reset_index(drop=True)

In [10]:
data.head()

Unnamed: 0,tweet,author
0,wishing celebrate today blessed joyful easter ...,barackobama
1,michelle send warmest wish gathering tonight p...,barackobama
2,mayor washington meant lot michelle — like man...,barackobama
3,traveled world together he's captured iconic s...,barackobama
4,75th anniversary jackie robinson day we're rem...,barackobama


In [11]:
len(data)

31286

In [12]:
data['author'] = pd.factorize(data['author'])[0]

In [13]:
data.head()

Unnamed: 0,tweet,author
0,wishing celebrate today blessed joyful easter ...,0
1,michelle send warmest wish gathering tonight p...,0
2,mayor washington meant lot michelle — like man...,0
3,traveled world together he's captured iconic s...,0
4,75th anniversary jackie robinson day we're rem...,0


In [14]:
data = data.dropna()

In [15]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 526 kB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 8.4 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 36.9 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 5.0 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 21.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.

In [16]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [17]:
sentences = data['tweet']

In [18]:
len(list(sentences))

30939

In [19]:
sentences = data['tweet']
embeddings = model.encode(list(sentences))

In [20]:
embeddings.shape

(30939, 384)

In [21]:
for sentence, embedding in zip(sentences, embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding.shape)
    break

Sentence: wishing celebrate today blessed joyful easter difficult couple year let give thanks gift enjoy — people make life special
Embedding: (384,)


In [22]:
bert_features = np.array(embeddings)

In [23]:
np.save('bert_features.npy', bert_features)

In [24]:
X = bert_features.copy()

In [25]:
y = list(data['author']).copy()

In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [27]:
len(X_train[0])

384

In [28]:
num_classes = len(set(y_train))
print(num_classes)

13


In [58]:
labels = list(set(y_train))
print(labels)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]


In [43]:
import time
from sklearn.metrics import classification_report

In [60]:
start_time = time.time()
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='adam', alpha=1e-5, learning_rate_init=0.001, learning_rate='adaptive',
                    hidden_layer_sizes=(384
                                        , 256, 64, 13), random_state=1, verbose=False, max_iter=100)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, pred))
print(classification_report(y_test, pred, labels=labels))
print("Time", time.time() - start_time, "s")

0.5783236371471666
              precision    recall  f1-score   support

           0       0.80      0.77      0.78       808
           1       0.69      0.48      0.57       807
           2       0.53      0.51      0.52       754
           3       0.47      0.69      0.56       825
           4       0.55      0.48      0.51       528
           5       0.51      0.42      0.46       815
           6       0.44      0.60      0.50       769
           7       0.73      0.68      0.70       804
           8       0.68      0.61      0.65       794
           9       0.69      0.60      0.65       381
          10       0.51      0.45      0.48       685
          11       0.65      0.66      0.65       511
          12       0.49      0.56      0.52       801

    accuracy                           0.58      9282
   macro avg       0.60      0.58      0.58      9282
weighted avg       0.59      0.58      0.58      9282

Time 81.11099171638489 s


In [63]:
import time

# SVM

In [64]:
start_time = time.time()
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
svm_clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
svm_clf.fit(X_train, y_train)
print("Time", time.time() - start_time, "s")

Time 79.27785778045654 s


In [65]:
pred = svm_clf.predict(X_test)
print(accuracy_score(y_test, pred))
print(classification_report(y_test, pred, labels=labels))

0.6412411118293472
              precision    recall  f1-score   support

           0       0.79      0.81      0.80       808
           1       0.68      0.70      0.69       807
           2       0.55      0.62      0.59       754
           3       0.66      0.59      0.62       825
           4       0.64      0.55      0.59       528
           5       0.51      0.58      0.55       815
           6       0.49      0.64      0.55       769
           7       0.74      0.73      0.74       804
           8       0.68      0.72      0.70       794
           9       0.76      0.60      0.67       381
          10       0.64      0.50      0.56       685
          11       0.75      0.67      0.71       511
          12       0.63      0.55      0.59       801

    accuracy                           0.64      9282
   macro avg       0.65      0.64      0.64      9282
weighted avg       0.65      0.64      0.64      9282



# RANDOM FOREST

In [66]:
start_time = time.time()
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(max_depth=10, random_state=0)
rf_clf.fit(X_train, y_train)
print("Time", time.time() - start_time, "s")

Time 31.250518321990967 s


In [67]:
pred = rf_clf.predict(X_test)
print(accuracy_score(y_test, pred))
print(classification_report(y_test, pred, labels=labels))

0.4861021331609567
              precision    recall  f1-score   support

           0       0.53      0.75      0.62       808
           1       0.50      0.63      0.56       807
           2       0.46      0.39      0.42       754
           3       0.43      0.56      0.48       825
           4       0.72      0.18      0.29       528
           5       0.40      0.33      0.36       815
           6       0.42      0.53      0.47       769
           7       0.49      0.74      0.59       804
           8       0.47      0.70      0.56       794
           9       0.83      0.23      0.36       381
          10       0.61      0.17      0.27       685
          11       0.78      0.42      0.55       511
          12       0.46      0.36      0.40       801

    accuracy                           0.49      9282
   macro avg       0.55      0.46      0.46      9282
weighted avg       0.52      0.49      0.47      9282



#LOGISTIC REGRESSION


In [68]:
start_time = time.time()
from sklearn.linear_model import LogisticRegression
lr_clf = LogisticRegression(random_state=0, max_iter=100).fit(X_train, y_train)
print("Time", time.time() - start_time, "s")

Time 10.101628065109253 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [69]:
pred = lr_clf.predict(X_test)
print(accuracy_score(y_test, pred))
print(classification_report(y_test, pred, labels=labels))

0.6026718379659556
              precision    recall  f1-score   support

           0       0.72      0.76      0.74       808
           1       0.65      0.65      0.65       807
           2       0.53      0.55      0.54       754
           3       0.58      0.58      0.58       825
           4       0.57      0.55      0.56       528
           5       0.49      0.47      0.48       815
           6       0.49      0.54      0.52       769
           7       0.70      0.72      0.71       804
           8       0.65      0.68      0.66       794
           9       0.69      0.63      0.66       381
          10       0.56      0.50      0.53       685
          11       0.66      0.67      0.66       511
          12       0.57      0.54      0.56       801

    accuracy                           0.60      9282
   macro avg       0.60      0.60      0.60      9282
weighted avg       0.60      0.60      0.60      9282

