In [1]:
import pandas as pd
import numpy as np
import pickle
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem.porter import *
import string
import re
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import ExtraTreeClassifier

from sklearn.naive_bayes import BernoulliNB

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


In [2]:
data = pd.read_csv("labeled_data.csv",encoding="latin-1")
tweets = data.tweet

In [4]:
stopwords=stopwords = nltk.corpus.stopwords.words("english")

other_exclusions = ["#ff", "ff", "rt"]
stopwords.extend(other_exclusions)

stemmer = PorterStemmer()


def preprocess(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, '', parsed_text)
    parsed_text = re.sub(mention_regex, '', parsed_text)
    return parsed_text

def tokenize(tweet):
    """Removes punctuation & excess whitespace, sets to lowercase,
    and stems tweets. Returns a list of stemmed tokens."""
    tweet = " ".join(re.split("[^a-zA-Z]*", tweet.lower())).strip()
    tokens = [stemmer.stem(t) for t in tweet.split()]
    return tokens

def basic_tokenize(tweet):
    """Same as tokenize but without the stemming"""
    tweet = " ".join(re.split("[^a-zA-Z.,!?]*", tweet.lower())).strip()
    return tweet.split()

In [5]:
def count_twitter_objs(text_string):
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    hashtag_regex = '#[\w\-]+'
    emoticons_regex = '[\U0001f600-\U0001f650]'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
    parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
    parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)
    parsed_text = re.sub(emoticons_regex, 'EMOTICONHERE', parsed_text)
    return(parsed_text.count('URLHERE'),parsed_text.count('MENTIONHERE'),parsed_text.count('HASHTAGHERE'),parsed_text.count('EMOTICONHERE'))

In [6]:
def syllable_count(word):
    word = word.lower()
    count = 0
    vowels = "aeiouy"
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith("e"):
        count -= 1
    if count == 0:
        count += 1
    return count

In [7]:
def features(tweet):
    words = preprocess(tweet) #Get text only
    syllables_all=0
    for word in words:
        syllables = syllable_count(words)
        syllables_all = syllables_all+syllables
        
    num_chars = sum(len(w) for w in words)
    num_chars_total = len(tweet)
    num_terms = len(tweet.split())
    num_words = len(words.split())
    avg_syl = round(float((syllables_all+0.001))/float(num_words+0.001),4)
    num_unique_terms = len(set(words.split()))
    
    ###Modified FK grade, where avg words per sentence is just num words/1
    FKRA = round(float(0.39 * float(num_words)/1.0) + float(11.8 * avg_syl) - 15.59,1)
    ##Modified FRE score, where sentence fixed to 1
    FRE = round(206.835 - 1.015*(float(num_words)/1.0) - (84.6*float(avg_syl)),2)
    
    twitter_objs = count_twitter_objs(tweet)
    retweet = 0
    if "rt" in words:
        retweet = 1
    
    features = [FKRA, FRE,syllables_all, avg_syl, num_chars, num_chars_total, num_terms, num_words,num_unique_terms,
               twitter_objs[2], twitter_objs[1],twitter_objs[0],twitter_objs[3], retweet]
    #features = pandas.DataFrame(features)
    return features

def get_feature_array(tweets):
    feats=[]
    for t in tweets:
        feats.append(features(t))
    return np.array(feats)

In [8]:
features_names = ["FKRA", "FRE","num_syllables", "avg_syl_per_word", "num_chars", "num_chars_total", \
                        "num_terms", "num_words", "num_unique_words","num_hashtags", "num_mentions", "num_urls", "num_emoticons","is_retweet"]

In [9]:
feats = get_feature_array(tweets)
M = np.concatenate([feats],axis=1)

X = pd.DataFrame(M)
y = data['class'].astype(int)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=69, test_size=0.33)

Logistic Regression

In [17]:
pipe = Pipeline(
        [('select', SelectFromModel(LogisticRegression(class_weight='balanced',
                                                  penalty="l1", C=0.01))),
        ('model', LogisticRegression(class_weight='balanced',penalty='l2'))])

param_grid = [{}] 
grid_search = GridSearchCV(pipe, param_grid,cv=StratifiedKFold(n_splits=5, random_state=69).split(X_train, y_train), verbose=2)
model = grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................................................. , total= 1.6min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.6min remaining:    0.0s


[CV] ................................................. , total= 1.6min
[CV]  ................................................................




[CV] ................................................. , total= 1.5min
[CV]  ................................................................
[CV] ................................................. , total= 2.2min
[CV]  ................................................................




[CV] ................................................. , total= 2.4min


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  9.2min finished


In [18]:
y_preds = model.predict(X_test)
report = classification_report( y_test, y_preds )
print(report)

              precision    recall  f1-score   support

           0       0.42      0.39      0.40      8186
           1       0.52      0.56      0.54     14805
           2       0.64      0.62      0.63     16957

   micro avg       0.55      0.55      0.55     39948
   macro avg       0.53      0.52      0.52     39948
weighted avg       0.55      0.55      0.55     39948



Random Forest

In [20]:
pipe = Pipeline(
        [('select', SelectFromModel(LogisticRegression(class_weight='balanced',
                                                  penalty="l1", C=0.01))),
        ('model', RandomForestClassifier(n_estimators=300, random_state=0))])

param_grid = [{}] # Optionally add parameters here
grid_search = GridSearchCV(pipe, param_grid,cv=StratifiedKFold(n_splits=5, random_state=69).split(X_train, y_train), verbose=2)
model = grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................................................. , total= 3.1min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  3.3min remaining:    0.0s


[CV] ................................................. , total= 2.3min
[CV]  ................................................................




[CV] ................................................. , total= 3.2min
[CV]  ................................................................




[CV] ................................................. , total= 3.4min
[CV]  ................................................................




[CV] ................................................. , total= 1.8min


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 14.8min finished


In [21]:
y_preds = model.predict(X_test)
report = classification_report( y_test, y_preds )
print(report)

              precision    recall  f1-score   support

           0       0.64      0.55      0.59      8186
           1       0.70      0.77      0.74     14805
           2       0.76      0.75      0.76     16957

   micro avg       0.72      0.72      0.72     39948
   macro avg       0.70      0.69      0.70     39948
weighted avg       0.72      0.72      0.72     39948



Linear SVC

In [23]:
pipe = Pipeline(
        [('select', SelectFromModel(LogisticRegression(class_weight='balanced',
                                                  penalty="l1", C=0.01))),
        ('model', LinearSVC(C=0.05,random_state=0))])

param_grid = [{}] # Optionally add parameters here
grid_search = GridSearchCV(pipe, param_grid,cv=StratifiedKFold(n_splits=5, random_state=69).split(X_train, y_train), verbose=2)
model = grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.4min remaining:    0.0s


[CV] ................................................. , total= 2.4min
[CV]  ................................................................




[CV] ................................................. , total= 2.1min
[CV]  ................................................................




[CV] ................................................. , total= 2.5min
[CV]  ................................................................




[CV] ................................................. , total= 2.2min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 11.1min finished


[CV] ................................................. , total= 1.8min




In [24]:
y_preds = model.predict(X_test)
report = classification_report( y_test, y_preds )
print(report)

              precision    recall  f1-score   support

           0       0.42      0.08      0.13      8186
           1       0.61      0.23      0.34     14805
           2       0.47      0.92      0.62     16957

   micro avg       0.49      0.49      0.49     39948
   macro avg       0.50      0.41      0.36     39948
weighted avg       0.51      0.49      0.42     39948



Extra Trees

In [26]:
pipe = Pipeline(
        [('select', SelectFromModel(LogisticRegression(class_weight='balanced',
                                                  penalty="l1", C=0.01))),
        ('model', ExtraTreeClassifier())])

param_grid = [{}] 
grid_search = GridSearchCV(pipe, param_grid,cv=StratifiedKFold(n_splits=5, random_state=69).split(X_train, y_train), verbose=2)
model = grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................................................. , total= 1.4min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.4min remaining:    0.0s


[CV] ................................................. , total= 1.3min
[CV]  ................................................................




[CV] ................................................. , total= 1.5min
[CV]  ................................................................




[CV] ................................................. , total= 1.1min
[CV]  ................................................................




[CV] ................................................. , total= 1.2min


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  6.6min finished


In [27]:
y_preds = model.predict(X_test)
report = classification_report( y_test, y_preds )
print(report)

              precision    recall  f1-score   support

           0       0.54      0.55      0.55      8186
           1       0.68      0.75      0.71     14805
           2       0.74      0.67      0.70     16957

   micro avg       0.67      0.67      0.67     39948
   macro avg       0.65      0.66      0.65     39948
weighted avg       0.68      0.67      0.67     39948



Naive Bayes

In [29]:
pipe = Pipeline(
        [('select', SelectFromModel(LogisticRegression(class_weight='balanced',
                                                  penalty="l1", C=0.01))),
        ('model', BernoulliNB())])

param_grid = [{}] # Optionally add parameters here
grid_search = GridSearchCV(pipe, param_grid,cv=StratifiedKFold(n_splits=5, random_state=69).split(X_train, y_train), verbose=2)
model = grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................................................. , total=  49.7s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   49.8s remaining:    0.0s


[CV] ................................................. , total= 1.0min
[CV]  ................................................................




[CV] ................................................. , total= 2.3min
[CV]  ................................................................
[CV] ................................................. , total=  50.8s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  6.9min finished


[CV] ................................................. , total= 1.9min




In [30]:
y_preds = model.predict(X_test)
report = classification_report( y_test, y_preds )
print(report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      8186
           1       0.44      0.76      0.56     14805
           2       0.65      0.55      0.60     16957

   micro avg       0.52      0.52      0.52     39948
   macro avg       0.36      0.44      0.39     39948
weighted avg       0.44      0.52      0.46     39948



  'precision', 'predicted', average, warn_for)


Random Forest

In [76]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [77]:
clf = RandomForestClassifier(n_estimators=300, random_state=0)
clf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [78]:
result = clf.predict(X_test)
report = classification_report( y_test, result )
print(report)

              precision    recall  f1-score   support

           0       0.64      0.55      0.59      8186
           1       0.70      0.77      0.73     14805
           2       0.76      0.75      0.75     16957

   micro avg       0.71      0.71      0.71     39948
   macro avg       0.70      0.69      0.69     39948
weighted avg       0.71      0.71      0.71     39948



Linear SVC

In [58]:
clf = LinearSVC(C=0.1,random_state=0)
clf.fit(X_train,y_train)



LinearSVC(C=0.1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
     verbose=0)

In [59]:
result = clf.predict(X_test)
report = classification_report( y_test, result )
print(report)

              precision    recall  f1-score   support

           0       0.23      0.01      0.02      8186
           1       0.47      0.62      0.54     14805
           2       0.57      0.67      0.61     16957

   micro avg       0.52      0.52      0.52     39948
   macro avg       0.42      0.43      0.39     39948
weighted avg       0.46      0.52      0.46     39948



Extra Trees

In [52]:
clf=ExtraTreeClassifier()
clf.fit(X_train,y_train)

ExtraTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
          max_features='auto', max_leaf_nodes=None,
          min_impurity_decrease=0.0, min_impurity_split=None,
          min_samples_leaf=1, min_samples_split=2,
          min_weight_fraction_leaf=0.0, random_state=None,
          splitter='random')

In [53]:
result = clf.predict(X_test)
report = classification_report( y_test, result )
print(report)

              precision    recall  f1-score   support

           0       0.54      0.56      0.55      8186
           1       0.67      0.74      0.71     14805
           2       0.74      0.66      0.70     16957

   micro avg       0.67      0.67      0.67     39948
   macro avg       0.65      0.65      0.65     39948
weighted avg       0.67      0.67      0.67     39948



Naive Bayes

In [54]:
clf = BernoulliNB() #Za bilo koi parametri isto e
clf.fit(X_train,y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [55]:
result = clf.predict(X_test)
report = classification_report( y_test, result )
print(report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      8186
           1       0.44      0.76      0.56     14805
           2       0.65      0.55      0.60     16957

   micro avg       0.52      0.52      0.52     39948
   macro avg       0.36      0.44      0.39     39948
weighted avg       0.44      0.52      0.46     39948



  'precision', 'predicted', average, warn_for)
