## Test Maxime ML

In [1]:
%load_ext autoreload
%autoreload 2

#import warnings; warnings.simplefilter('ignore')
#import os, codecs, string, random
import numpy as np
import pandas as pd
#from numpy.random import seed as random_seed
#from numpy.random import shuffle as random_shuffle
import matplotlib.pyplot as plt
%matplotlib inline



import functools
import pickle
import nltk
import pickle
import scipy.sparse as sparse
from collections import Counter

#NLP libraries
import spacy, nltk, gensim, sklearn
import pyLDAvis.gensim_models

#Vader
import vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

#Scikit imports
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

  from imp import reload


In [2]:
seed = 42
np.random.seed(seed)

In [3]:
#! python -m spacy validate
#nlp = spacy.load('data/en_core_web_sm-3.0.0')

In [4]:
with open('data/stopwords.pkl', 'rb') as f:
    stopwords = pickle.load(f)

In [5]:
def compute_accuracy(Y_pred, Y_true):
    return np.sum(Y_pred == Y_true) / len(Y_pred)

In [6]:
def count_parenthesis(tweet):
    count = 0
    for c in tweet:
        if c == '(':
            count += 1
        if c == ')':
            count -= 1
    return count

In [7]:
def has_parenthesis(tweet):
    for c in tweet:
        if c in "()":
            return True
    return False

In [8]:
def count_longest_rep(tweet):
    count = 1
    count_max = 1
    c_last = ''
    for c in tweet:
        if c == c_last:
            count += 1
        if c != c_last:
            count = 1
        c_last = c
        count_max = max(count_max, count)
    return count_max

In [9]:
DATA_FOLDER = 'data/'
TWITTER_FOLDER = DATA_FOLDER + 'twitter-datasets/'
EN_CORE_WEB_SM = DATA_FOLDER + 'en_core_web_sm-3.0.0'

# importing Data

In [33]:
pos = pd.read_csv(TWITTER_FOLDER + 'train_pos.txt', sep='\t', header=None, names=['tweet'])#[:10000]
neg = pd.read_csv(TWITTER_FOLDER + 'train_neg.txt', sep='\t', header=None, names=['tweet'])#[:10000]

In [123]:
pos_full = pd.read_csv(TWITTER_FOLDER + 'train_pos_full.txt', sep='\t', header=None, names=['tweet'])
neg_full = pd.read_csv(TWITTER_FOLDER + 'train_neg_full.txt', sep='\t', header=None, names=['tweet'], on_bad_lines='skip')

In [124]:
pos_full = pos_full.drop_duplicates()
neg_full = neg_full.drop_duplicates()

In [125]:
test_data = pd.read_csv(TWITTER_FOLDER + 'test_data.txt', sep='\t', header=None, names=['tweet'])

# Creating Test/Train

#### Adding some processing

In [126]:
pos_full['is_pos'] = 1
neg_full['is_pos'] = 0
tweets = pd.concat([pos_full, neg_full])

analyzer = SentimentIntensityAnalyzer()
polarity_scores = tweets['tweet'].apply(analyzer.polarity_scores)

tweets['neg']       = polarity_scores.apply(lambda d : d['neg'])
tweets['neu']       = polarity_scores.apply(lambda d : d['neu'])
tweets['pos']       = polarity_scores.apply(lambda d : d['pos'])
tweets['compound']  = polarity_scores.apply(lambda d : d['compound'])

tweets['par_count'] = tweets["tweet"].apply(count_parenthesis)
tweets['len_tweet'] = tweets["tweet"].apply(len)
tweets['rep_count'] = tweets["tweet"].apply(count_longest_rep)

#### Split test / train

In [127]:
X = tweets.drop(['is_pos'], axis=1)
Y = tweets.drop(['tweet'],  axis=1)
X, Y = shuffle(X, Y, random_state=42)
X_tr_df, X_te_df, Y_tr_df, Y_te_df = train_test_split(X, Y, test_size=0.1, random_state=42)

In [128]:
X_tr = X_tr_df['tweet']
X_te = X_te_df['tweet']

Y_tr = np.array(Y_tr_df['is_pos'])
Y_te = np.array(Y_te_df['is_pos'])

#### Vectorization

In [129]:
vectorizer = TfidfVectorizer(min_df=5, max_df=0.75, ngram_range=(1,5), strip_accents='unicode').fit(X_tr)

In [130]:
X_tr_vec = vectorizer.transform(X_tr)
X_te_vec = vectorizer.transform(X_te)

# Predictions

In [131]:
par_count_tr = np.array(X_tr_df['par_count']).reshape(-1,1)
par_count_te = np.array(X_te_df['par_count']).reshape(-1,1)

pol_score_tr = np.array(X_tr_df[['neg', 'neu', 'pos', 'compound']])
pol_score_te = np.array(X_te_df[['neg', 'neu', 'pos', 'compound']])

metadata_tr  = np.array(X_tr_df[['neg', 'neu', 'pos', 'compound', 'par_count', 'rep_count', ]])
metadata_te  = np.array(X_te_df[['neg', 'neu', 'pos', 'compound', 'par_count', 'rep_count']])

In [132]:
scaler_metadata    = preprocessing.StandardScaler().fit(metadata_tr)

metadata_scaled_tr = scaler_metadata.transform(metadata_tr) / 10
metadata_scaled_te = scaler_metadata.transform(metadata_te) / 10

pol_score_scaled_tr = metadata_scaled_tr[:, :4]
pol_score_scaled_te = metadata_scaled_te[:, :4]

In [133]:
X_tr_with_pole_score = sparse.hstack((X_tr_vec, pol_score_scaled_tr))
X_te_with_pole_score = sparse.hstack((X_te_vec, pol_score_scaled_te ))

X_tr_with_metadata   = sparse.hstack((X_tr_vec, metadata_scaled_tr))
X_te_with_metadata   = sparse.hstack((X_te_vec, metadata_scaled_te))

In [None]:
log_reg_1 = LogisticRegression()
log_reg_2 = LogisticRegression()
log_reg_3 = LogisticRegression(max_iter=1000)
log_reg_4 = LogisticRegression(max_iter=1000)
log_reg_5 = LogisticRegression(max_iter=1000)

log_reg_1.fit(pol_score_tr,         Y_tr)
log_reg_2.fit(metadata_tr,          Y_tr)
log_reg_3.fit(X_tr_vec,             Y_tr)
log_reg_4.fit(X_tr_with_pole_score, Y_tr)
log_reg_5.fit(X_tr_with_metadata,   Y_tr);

In [None]:
print(log_reg_1.score(pol_score_te,         Y_te))
print(log_reg_2.score(metadata_te,          Y_te))
print(log_reg_3.score(X_te_vec,             Y_te))
print(log_reg_4.score(X_te_with_pole_score, Y_te))
print(log_reg_5.score(X_te_with_metadata,   Y_te))

In [None]:
def cond_1(df):
    return np.array(df.par_count >  0)
def cond_2(df):
    return np.array(df.par_count <  0)
def cond_3(df):
    return np.array(df.par_count == 0)
def cond_3_1(df):
    return np.array((df.par_count == 0) & (df.compound == 0))
def cond_3_2(df):
    return np.array((df.par_count == 0) & (df.compound >  0))
def cond_3_3(df):
    return np.array((df.par_count == 0) & (df.compound <  0))

conditions = [cond_1, cond_2, cond_3]
conditions_X_tr = np.array([cond(X_tr_df) for cond in conditions])
conditions_X_te = np.array([cond(X_te_df) for cond in conditions])

In [None]:
X_tr_with_metadata = sparse.csr_matrix(X_tr_with_metadata)
log_reg_list = []
for cond in conditions_X_tr:
    X_tr_subset = X_tr_with_metadata[cond]
    Y_tr_subset = np.array(Y_tr_df['is_pos'][cond])
    clf = LogisticRegression(max_iter=1000).fit(X_tr_subset, Y_tr_subset)
    log_reg_list.append(clf)

In [None]:
Y_pred_list = []
for clf in log_reg_list:
    Y_pred_list.append(clf.predict(X_te_with_metadata))
Y_pred_list = np.array(Y_pred_list)

In [None]:
pred_multi = np.empty(len(X_te_df))
for i in range(len(pred_multi)):
    a = int(np.argwhere(conditions_X_te[:,i]))
    pred_multi[i] = Y_pred_list[a, i]

In [None]:
compute_accuracy(pred_multi, Y_te)

#### Results on the full dataset

log_reg_1.score (pol_score_te,         Y_te)  :  0.6507
log_reg_2.score (metadata_te,          Y_te)  :  0.7477
log_reg_3.score (X_te_vec,             Y_te)  :  0.8542
log_reg_4.score (X_te_with_pole_score, Y_te)  :  0.8551
log_reg_5.score (X_te_with_metadata,   Y_te)  :  0.8653
compute_accuracy(pred_multi,           Y_te)  :  0.8664

## Logistic Regression

Y_train_count_bal = np.array(Y_train['is_pos'][Y_train.parenthesis_count == 0])
Y_train_count_pos = np.array(Y_train['is_pos'][Y_train.parenthesis_count >  0])
Y_train_count_neg = np.array(Y_train['is_pos'][Y_train.parenthesis_count <  0])

X_train_count_bal_vec = X_train_all_vec[X_train.parenthesis_count == 0]
X_train_count_pos_vec = X_train_all_vec[X_train.parenthesis_count >  0]
X_train_count_neg_vec = X_train_all_vec[X_train.parenthesis_count <  0]

## SVM

In [23]:
from sklearn.svm import LinearSVC

In [41]:
svc = LinearSVC()
svc.fit(X_tr_with_metadata,   Y_tr);

In [42]:
svc.score(X_te_with_metadata, Y_te)

In [1]:
from sklearn.svm import NuSVC

In [41]:
nusvc = NuSVC(gamma="auto")
nusvc.fit(X_tr_with_metadata,   Y_tr);

In [42]:
nusvc.score(X_te_with_metadata, Y_te)

## RandomForest

In [29]:
rfc = RandomForestClassifier(max_depth=100, random_state=42)
rfc.fit(X_tr_with_metadata,   Y_tr);

In [30]:
rfc.score(X_te_with_metadata, Y_te)

0.7198558156064375

# SandBox

In [290]:
tweet = '(()())'

def test_parenthesis(tweet):
    count = 0
    for c in tweet:
        if c == '(':
            count += 1
        if c == ')':
            count -= 1
        if count < 0:
            return False
    return count == 0

def count_parenthesis(tweet):
    count = 0
    for c in tweet:
        if c == '(':
            count += 1
        if c == ')':
            count -= 1
    return count

In [262]:
res = tweets['Tweet'].apply(test_parenthesis)

In [283]:
not_closed = tweets[~res].copy()

In [292]:
not_closed['parenthesis_count'] = not_closed["Tweet"].apply(count_parenthesis)

In [338]:
balanced = not_closed[not_closed.parenthesis_count > 0]

In [339]:
1-np.average(np.array(balanced['is_pos']))

0.9683708952674591

In [340]:
0.14 * 0.88 + 0.86 * 0.968

0.95568

In [277]:
np.average(np.array(not_closed['is_pos']))

0.13920948374994746

In [8]:
with open('data/stopwords.pkl', 'rb') as f:
    stopwords = pickle.load(f)

In [259]:
a[a.par_count > 0].groupby('is_pos').count()

Unnamed: 0_level_0,par_count
is_pos,Unnamed: 1_level_1
0,35661
1,1170


In [260]:
a[a.par_count < 0].groupby('is_pos').count()

Unnamed: 0_level_0,par_count
is_pos,Unnamed: 1_level_1
0,637
1,4840


In [261]:
a[a.par_count == 0].groupby('is_pos').count()

Unnamed: 0_level_0,par_count
is_pos,Unnamed: 1_level_1
0,63702
1,93990


In [262]:
a.groupby('is_pos').count()

Unnamed: 0_level_0,par_count
is_pos,Unnamed: 1_level_1
0,100000
1,100000


In [275]:
len(np.array(tweets[a.par_count == 19]['tweet'])[0])

4849

In [282]:
tweets[tweets.rep_count > 2]

Unnamed: 0,tweet,is_pos,neg,neu,pos,compound,par_count,len_tweet,rep_count
3,<user> <user> thanks sir > > don't trip lil ma...,1,0.0,0.834,0.166,0.4926,0,78,3
4,visiting my brother tmr is the bestest birthda...,1,0.0,0.761,0.239,0.5826,0,66,3
10,<user> anddd to cheer #nationals2013 ?,1,0.0,0.602,0.398,0.5106,0,38,3
11,we send an invitation to shop on-line ! here y...,1,0.0,1.000,0.000,0.0000,0,111,3
15,like dammm <user> lexis u got a lot to say whe...,1,0.0,0.710,0.290,0.6486,0,65,3
...,...,...,...,...,...,...,...,...,...
99953,tripp lite n201 - 004 - bl cat 6 gigabit snagl...,0,0.0,0.887,0.113,0.5106,1,127,3
99957,russian-english dictionary of scientific and t...,0,0.0,1.000,0.000,0.0000,1,125,3
99959,<user> omg are people getting their slots alre...,0,0.0,0.896,0.104,0.4404,0,120,3
99962,fanfan la tulipe ( dvd legendary french star g...,0,0.0,1.000,0.000,0.0000,1,121,3


In [300]:
tweets['has_par'] = tweets['tweet'].apply(has_parenthesis)

In [309]:
tweets[tweets.has_par][['par_count', 'is_pos']].groupby('par_count').mean()

Unnamed: 0_level_0,is_pos
par_count,Unnamed: 1_level_1
-10,1.0
-9,1.0
-8,1.0
-7,1.0
-6,0.909091
-5,0.842105
-4,0.935484
-3,0.90056
-2,0.882414
-1,0.880129
