Baseline 1

In [None]:
import spacy
import pandas as pd
import re
import string
import numpy as np

## Dataset preparation
Importing data

In [None]:
# Load the spaCy `en_core_web_sm` pipeline and the pipe-delimited dataset,
# using the first CSV column as the row index.
tokens = spacy.load("en_core_web_sm")
dataset = pd.read_csv(
    "../data/Organic_extended_finalv2.csv",
    index_col=0,
    sep="|",
)
# Display the column index together with the number of rows.
(dataset.columns, len(dataset))

In [None]:
# Preview the first five raw titles before any cleaning is applied.
dataset['title'].head(5)

Cleaning text
- replace non-ASCII characters with spaces
- replace numbers with spaces
- replace punctuation with spaces
- replace tabs and line breaks with spaces
- convert text to lowercase

In [None]:
# Patterns are compiled once here rather than on every call to clean_text.
_NON_ASCII_RE = re.compile(r"[^\x00-\x7F]+")
_PUNCT_DIGIT_WS_RE = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')


def clean_text(text):
    """Normalise a title to lowercase ASCII letters separated by spaces.

    Each run of non-ASCII characters is replaced by one space; every
    punctuation character, digit, carriage return, tab and newline is replaced
    by a space; the text is lowercased and leading/trailing whitespace is
    stripped. Interior runs of spaces are NOT collapsed.

    Parameters
    ----------
    text : str or None or float
        Raw title. ``None`` and NaN (how pandas represents an empty CSV field)
        are treated as missing.

    Returns
    -------
    str
        The cleaned title, or ``""`` for a missing value.
    """
    # NaN is the only float that is not equal to itself; without this guard a
    # single empty title makes re.sub raise TypeError and aborts the whole apply.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    ascii_only = _NON_ASCII_RE.sub(" ", text)
    return _PUNCT_DIGIT_WS_RE.sub(" ", ascii_only.lower()).strip()

In [None]:
# Clean every title element-wise, overwriting the column, then preview the result.
dataset['title'] = dataset['title'].map(clean_text)
dataset['title'].head(5)

Getting the maximum retweet count for each row

In [None]:
# Columns whose header is one of the strings "1".."100" are the ones reduced
# into `max_retweets` below.
possible_cols = {str(x) for x in range(1,101)}
actual_cols = set(dataset.columns).intersection(possible_cols)
print(actual_cols)
# Select with an ordered list rather than the set itself: pandas >= 2.0 raises
# TypeError for a set indexer, and a numeric sort makes the column order
# deterministic across runs.
dataset['max_retweets'] = dataset[sorted(actual_cols, key=int)].max(axis=1)

Setting labels

In [None]:
# A row is labelled 1 when its max_retweets is at least the median max_retweets
# of its user_id group, otherwise 0.
dataset['label'] = 0  # create the column first so it sits before 'median'
dataset['median'] = dataset.groupby('user_id')['max_retweets'].transform('median')
dataset['label'] = dataset['max_retweets'].ge(dataset['median']).astype('int64')
dataset.loc[:, ['user_id', 'max_retweets', 'median', 'label']].head(15)

default baseline - using median threshold

In [None]:
# Fraction of rows carrying label 1 and label 0, in that order.
tuple(len(dataset[dataset['label'] == flag]) / len(dataset) for flag in (1, 0))

## Baseline coding

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report


- dataset vectorization
- shuffling dataset and creating train test split

In [None]:
# Split the cleaned titles and labels 60/40 with a fixed seed, then fit the
# count vocabulary on the training titles only and reuse it for the test titles.
vectorizer = CountVectorizer()

X_train, X_test, y_train, y_test = train_test_split(
    dataset['title'],
    dataset['label'],
    test_size=0.4,
    shuffle=True,
    random_state=12345,
)
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

training the model

In [None]:
# Linear SVM with a fixed seed, trained on the vectorised training titles.
classifier = LinearSVC(random_state=12345)
classifier.fit(X=X_train, y=y_train)

evaluation

In [None]:
# Predicted labels for the held-out test matrix.
y_pred = classifier.predict(X=X_test)

In [None]:
# Per-class precision/recall/F1 on the test split (label 0 = 'Non Viral', 1 = 'Viral').
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=['Non Viral', 'Viral'],
    )
)

In [None]:
# Same report on the training split, for comparison with the test-split scores.
print(
    classification_report(
        y_true=y_train,
        y_pred=classifier.predict(X_train),
        target_names=['Non Viral', 'Viral'],
    )
)

## Shallow Learning Baseline Model

In [None]:
import pandas as pd
import numpy as np
import spacy
import re
import string
from collections import Counter


Loading tokenizer and dataset

In [None]:
# Load the spaCy `en_core_web_sm` pipeline (used below for tokenising titles)
# and the pipe-delimited dataset, using the first CSV column as the row index.
tokenizer = spacy.load("en_core_web_sm")
dataset = pd.read_csv(
    "../data/Organic_extended_finalv2.csv",
    index_col=0,
    sep="|",
)
# Display the column index together with the number of rows.
(dataset.columns, len(dataset))

Cleaning dataset

In [None]:
# Patterns are compiled once here rather than on every call to clean_text.
_NON_ASCII_RE = re.compile(r"[^\x00-\x7F]+")
_PUNCT_DIGIT_WS_RE = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')


def clean_text(text):
    """Normalise a title to lowercase ASCII letters separated by spaces.

    Each run of non-ASCII characters is replaced by one space; every
    punctuation character, digit, carriage return, tab and newline is replaced
    by a space; the text is lowercased and leading/trailing whitespace is
    stripped. Interior runs of spaces are NOT collapsed.

    Parameters
    ----------
    text : str or None or float
        Raw title. ``None`` and NaN (how pandas represents an empty CSV field)
        are treated as missing.

    Returns
    -------
    str
        The cleaned title, or ``""`` for a missing value.
    """
    # NaN is the only float that is not equal to itself; without this guard a
    # single empty title makes re.sub raise TypeError and aborts the whole apply.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    ascii_only = _NON_ASCII_RE.sub(" ", text)
    return _PUNCT_DIGIT_WS_RE.sub(" ", ascii_only.lower()).strip()
# Clean every title element-wise, overwriting the column, then preview the result.
dataset['title'] = dataset['title'].map(clean_text)
dataset['title'].head(5)

Creating the label from the median of `max_retweets` within each newsgroup (`user_id`).

In [None]:
# Columns whose header is one of the strings "1".."100" are the ones reduced
# into `max_retweets` below.
possible_cols = {str(x) for x in range(1,101)}
actual_cols = set(dataset.columns).intersection(possible_cols)
print(actual_cols)
# Select with an ordered list rather than the set itself: pandas >= 2.0 raises
# TypeError for a set indexer, and a numeric sort makes the column order
# deterministic across runs.
dataset['max_retweets'] = dataset[sorted(actual_cols, key=int)].max(axis=1)
# Label is 1 when a row's max_retweets reaches the median max_retweets of its
# user_id group, otherwise it stays 0.
dataset['label'] = 0
dataset['median'] = dataset.groupby('user_id')['max_retweets'].transform('median')
dataset.loc[dataset['max_retweets']>=dataset['median'],'label'] = 1
dataset[['user_id','max_retweets','median','label']].head(15)

In [None]:
def counter(df, tokenizer):
    """Count how often each token occurs across all titles in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``title`` column of strings.
    tokenizer : object
        Object exposing ``.tokenizer(text)`` that yields tokens with a
        ``.text`` attribute (the notebook passes a loaded spaCy pipeline).

    Returns
    -------
    collections.Counter
        Mapping of token text to its number of occurrences. Whitespace-only
        tokens are excluded.
    """
    counts = Counter()
    # Iterate the column directly; iterrows() builds a full Series per row
    # only to read one field from it.
    for title in df['title']:
        counts.update(
            token.text
            for token in tokenizer.tokenizer(title.strip())
            # Skip whitespace tokens while counting. The previous post-hoc
            # deletion only covered runs of 1-99 plain spaces, so longer runs
            # or other whitespace leaked into the vocabulary.
            if not token.text.isspace()
        )
    return counts
# Token frequencies over every cleaned title in the dataset.
counts = counter(df=dataset, tokenizer=tokenizer)