Baseline 1

In [93]:
import spacy
import pandas as pd
import re
import string
import numpy as np

## Dataset preparation
Importing data

In [94]:
# Load the small English spaCy pipeline and the pipe-separated dataset;
# the first CSV column is used as the row index.
tokens = spacy.load("en_core_web_sm")
dataset = pd.read_csv(
    "../data/Organic_extended_finalv2.csv",
    sep="|",
    index_col=0,
)
# Quick sanity check: available columns and number of rows.
dataset.columns, len(dataset)

(Index(['tweet_id', 'created_time', 'count', '1', '2', '3', '4', '5', '6',
        'user_id', 'screen_name', 'url', 'follower_count', 'title', 'content',
        'expanded_url', 'title_len', 'content_len'],
       dtype='object'),
 12341)

In [95]:
# Peek at the raw titles before any cleaning is applied.
dataset.loc[:, 'title'].head()

0    ‘All Black Lives Matter’ painted on Hollywood ...
1    Millions in lawsuit settlements are another hi...
2    Woman becomes first observant Sikh to graduate...
3    As Social Distancing Wanes, Cuomo Warns of Ano...
4    They lost loved ones to police violence. Georg...
Name: title, dtype: object

Cleaning text
- replace non-ASCII characters with a space
- remove numbers
- remove punctuation
- remove tabs, carriage returns and newlines
- convert text to lowercase

In [96]:
# Patterns are compiled once instead of on every call of clean_text.
# Runs of non-ASCII characters (e.g. curly quotes) collapse into one space.
_NON_ASCII_RE = re.compile(r"[^\x00-\x7F]+")
# ASCII punctuation, digits, carriage returns, tabs and newlines.
_NOISE_RE = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')


def clean_text(text):
    """Normalise a piece of text for bag-of-words vectorisation.

    Steps, in order:
      1. replace every run of non-ASCII characters with a single space,
      2. lowercase the text,
      3. replace each ASCII punctuation character, digit, carriage return,
         tab and newline with a space,
      4. strip leading and trailing whitespace.

    Inner runs of spaces are not collapsed.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. ``None`` or the float NaN that
        pandas uses for a missing cell) is treated as missing.

    Returns
    -------
    str
        The cleaned text, or ``""`` when ``text`` is not a string.
    """
    # Missing cells reach .apply() as NaN/None; re.sub would raise TypeError.
    if not isinstance(text, str):
        return ""
    ascii_only = _NON_ASCII_RE.sub(" ", text)
    return _NOISE_RE.sub(" ", ascii_only.lower()).strip()

In [97]:
# Clean every title element-wise (overwrites the raw 'title' column),
# then preview the result.
dataset['title'] = dataset['title'].map(clean_text)
dataset['title'].head()

0    all black lives matter  painted on hollywood b...
1    millions in lawsuit settlements are another hi...
2    woman becomes first observant sikh to graduate...
3    as social distancing wanes  cuomo warns of ano...
4    they lost loved ones to police violence  georg...
Name: title, dtype: object

Getting the max retweets for a row (maximum over the numbered columns)

In [98]:
# Numbered columns may be named '1' .. '100'; keep only those that are
# actually present in this dataset.
# NOTE(review): presumably each numbered column holds a retweet count — confirm.
possible_cols = {str(x) for x in range(1, 101)}
actual_cols = set(dataset.columns).intersection(possible_cols)
print(actual_cols)
# pandas >= 2.0 raises TypeError when a set is used as a column indexer
# (deprecated since 1.5), so select with a numerically ordered list instead.
retweet_cols = sorted(actual_cols, key=int)
# Row-wise maximum across the numbered columns.
dataset['max_retweets'] = dataset[retweet_cols].max(axis=1)

{'1', '3', '5', '4', '6', '2'}


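Setting labels: a row gets label 1 when its `max_retweets` is at or above the median `max_retweets` of the same `user_id`, otherwise 0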

In [130]:
# Start with every row labelled 0.
dataset['label'] = 0
# Per-user median of max_retweets, broadcast back to each of that user's rows.
per_user_median = dataset.groupby('user_id')['max_retweets'].transform('median')
dataset['median'] = per_user_median
# Rows at or above their user's median get label 1.
at_or_above_median = dataset['max_retweets'] >= per_user_median
dataset.loc[at_or_above_median, 'label'] = 1
dataset[['user_id', 'max_retweets', 'median', 'label']].head(15)

Unnamed: 0,user_id,max_retweets,median,label
0,16664681.0,466.0,18.0,1
1,28785486.0,163.0,77.0,1
2,759251.0,936.0,141.0,1
3,807095.0,2381.0,113.0,1
4,16664681.0,267.0,18.0,1
5,5392522.0,938.0,117.0,1
6,16664681.0,33.0,18.0,1
7,16664681.0,72.0,18.0,1
8,16664681.0,48.0,18.0,1
9,16664681.0,70.0,18.0,1


Default baseline – class balance under the median threshold (fraction of rows with label 1, then fraction with label 0)

In [100]:
# Share of rows in each class: (label == 1, label == 0).
n_rows = len(dataset)
n_label_1 = len(dataset.loc[dataset['label'] == 1])
n_label_0 = len(dataset.loc[dataset['label'] == 0])
n_label_1 / n_rows, n_label_0 / n_rows

(0.5073332793128595, 0.4926667206871404)

## Baseline coding

In [101]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report


- shuffling the dataset and creating a train/test split (60% train, 40% test)
- vectorizing the titles with a `CountVectorizer` fitted on the training split only

In [117]:
vectorizer = CountVectorizer()

# Shuffled 60/40 split of cleaned titles and labels; the fixed seed keeps
# the split reproducible.
titles_train, titles_test, y_train, y_test = train_test_split(
    dataset['title'],
    dataset['label'],
    test_size=0.4,
    random_state=12345,
    shuffle=True,
)
# Learn the vocabulary from the training titles only, then reuse it to
# encode the test titles.
X_train = vectorizer.fit_transform(titles_train)
X_test = vectorizer.transform(titles_test)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(7404, 9178) (7404,) (4937, 9178) (4937,)


training the model

In [118]:
# Linear SVM on the count features; seeded for reproducibility.
classifier = LinearSVC(random_state=12345)
classifier.fit(X=X_train, y=y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=12345, tol=0.0001,
          verbose=0)

evaluation

In [119]:
# Predicted labels for the held-out test split.
y_pred = classifier.predict(X=X_test)

In [120]:
# Per-class precision / recall / F1 on the test split.
test_report = classification_report(
    y_test,
    y_pred,
    target_names=['Non Viral', 'Viral'],
)
print(test_report)

              precision    recall  f1-score   support

   Non Viral       0.68      0.65      0.66      2416
       Viral       0.68      0.70      0.69      2521

    accuracy                           0.68      4937
   macro avg       0.68      0.68      0.68      4937
weighted avg       0.68      0.68      0.68      4937



In [121]:
# Same report on the training split, to compare against the test scores.
train_pred = classifier.predict(X_train)
train_report = classification_report(
    y_train,
    train_pred,
    target_names=['Non Viral', 'Viral'],
)
print(train_report)

              precision    recall  f1-score   support

   Non Viral       0.94      0.93      0.94      3664
       Viral       0.93      0.95      0.94      3740

    accuracy                           0.94      7404
   macro avg       0.94      0.94      0.94      7404
weighted avg       0.94      0.94      0.94      7404

