## Test Maxime ML

In [1]:
%load_ext autoreload
%autoreload 2

#import warnings; warnings.simplefilter('ignore')
#import os, codecs, string, random
import numpy as np
import pandas as pd
#from numpy.random import seed as random_seed
#from numpy.random import shuffle as random_shuffle
import matplotlib.pyplot as plt
%matplotlib inline



import functools
import pickle
import nltk
import pickle
import scipy.sparse as sparse
from collections import Counter

#NLP libraries
import spacy, nltk, gensim, sklearn
import pyLDAvis.gensim_models

#Vader
import vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

#Scikit imports
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

  from imp import reload


In [2]:
seed = 42
np.random.seed(seed)

In [3]:
#! python -m spacy validate
#nlp = spacy.load('data/en_core_web_sm-3.0.0')

In [4]:
with open('data/stopwords.pkl', 'rb') as f:
    stopwords = pickle.load(f)

In [5]:
def compute_accuracy(Y_pred, Y_true):
    return np.sum(Y_pred == Y_true) / len(Y_pred)

In [6]:
def count_parenthesis(tweet):
    count = 0
    for c in tweet:
        if c == '(':
            count += 1
        if c == ')':
            count -= 1
    return count

In [101]:
tweet = 'aksfjhd HHSIF dsfj K'

In [102]:
count_longest_rep(tweet)

2

In [99]:
def count_longest_rep(tweet):
    count = 1
    count_max = 1
    c_last = ''
    for c in tweet:
        if c == c_last:
            count += 1
        if c != c_last:
            count = 1
        c_last = c
        count_max = max(count_max, count)
    return count_max

In [9]:
DATA_FOLDER = 'data/'
TWITTER_FOLDER = DATA_FOLDER + 'twitter-datasets/'
EN_CORE_WEB_SM = DATA_FOLDER + 'en_core_web_sm-3.0.0'

# importing Data

In [10]:
pos = pd.read_csv(TWITTER_FOLDER + 'train_pos.txt', sep='\t', header=None, names=['tweet'])#[:10000]
neg = pd.read_csv(TWITTER_FOLDER + 'train_neg.txt', sep='\t', header=None, names=['tweet'])#[:10000]

In [130]:
pos_full = pd.read_csv(TWITTER_FOLDER + 'train_pos_full.txt', sep='\t', header=None, names=['tweet'])#[:10000]
neg_full = pd.read_csv(TWITTER_FOLDER + 'train_neg_full.txt', sep='\t', header=None, names=['tweet'], on_bad_lines='skip')#[:10000]

In [131]:
test_data = pd.read_csv(TWITTER_FOLDER + 'test_data.txt', sep='\t', header=None, names=['tweet'])

# Creating Test/Train

#### Adding some processing

In [132]:
pos_full['is_pos'] = 1
neg_full['is_pos'] = 0
tweets = pd.concat([pos_full, neg_full])

analyzer = SentimentIntensityAnalyzer()
polarity_scores = tweets['tweet'].apply(analyzer.polarity_scores)

tweets['neg']       = polarity_scores.apply(lambda d : d['neg'])
tweets['neu']       = polarity_scores.apply(lambda d : d['neu'])
tweets['pos']       = polarity_scores.apply(lambda d : d['pos'])
tweets['compound']  = polarity_scores.apply(lambda d : d['compound'])

tweets['par_count'] = tweets["tweet"].apply(count_parenthesis)
tweets['len_tweet'] = tweets["tweet"].apply(len)
tweets['rep_count'] = tweets["tweet"].apply(count_longest_rep)

#### Split test / train

In [133]:
X = tweets.drop(['is_pos'], axis=1)
Y = tweets.drop(['tweet'],  axis=1)
X, Y = shuffle(X, Y, random_state=42)
X_tr_df, X_te_df, Y_tr_df, Y_te_df = train_test_split(X, Y, test_size=0.1, random_state=42)

In [134]:
X_tr = X_tr_df['tweet']
X_te = X_te_df['tweet']

Y_tr = np.array(Y_tr_df['is_pos'])
Y_te = np.array(Y_te_df['is_pos'])

#### Vectorization

In [135]:
vectorizer = TfidfVectorizer(lowercase=False, min_df=20, max_df=0.5, ngram_range=(1,4)).fit(X_tr)

In [136]:
X_tr_vec = vectorizer.transform(X_tr)
X_te_vec = vectorizer.transform(X_te)

# Predictions

In [137]:
par_count_tr = np.array(X_tr_df['par_count']).reshape(-1,1)
par_count_te = np.array(X_te_df['par_count']).reshape(-1,1)

pol_score_tr = np.array(X_tr_df[['neg', 'neu', 'pos', 'compound']])
pol_score_te = np.array(X_te_df[['neg', 'neu', 'pos', 'compound']])

metadata_tr  = np.array(X_tr_df[['neg', 'neu', 'pos', 'compound', 'par_count', 'rep_count', ]])
metadata_te  = np.array(X_te_df[['neg', 'neu', 'pos', 'compound', 'par_count', 'rep_count']])

In [138]:
scaler_metadata    = preprocessing.StandardScaler().fit(metadata_tr)

metadata_scaled_tr = scaler_metadata.transform(metadata_tr) / 10
metadata_scaled_te = scaler_metadata.transform(metadata_te) / 10

pol_score_scaled_tr = metadata_scaled_tr[:, :4]
pol_score_scaled_te = metadata_scaled_te[:, :4]

In [139]:
X_tr_with_pole_score = sparse.hstack((X_tr_vec, pol_score_scaled_tr))
X_te_with_pole_score = sparse.hstack((X_te_vec, pol_score_scaled_te ))

X_tr_with_metadata   = sparse.hstack((X_tr_vec, metadata_scaled_tr))
X_te_with_metadata   = sparse.hstack((X_te_vec, metadata_scaled_te))

In [140]:
log_reg_1 = LogisticRegression()
log_reg_2 = LogisticRegression()
log_reg_3 = LogisticRegression(max_iter=1000)
log_reg_4 = LogisticRegression(max_iter=1000)
log_reg_5 = LogisticRegression(max_iter=1000)

log_reg_1.fit(pol_score_tr,         Y_tr)
log_reg_2.fit(metadata_tr,          Y_tr)
log_reg_3.fit(X_tr_vec,             Y_tr)
log_reg_4.fit(X_tr_with_pole_score, Y_tr)
log_reg_5.fit(X_tr_with_metadata,   Y_tr);

In [141]:
print(log_reg_1.score(pol_score_te,         Y_te))
print(log_reg_2.score(metadata_te,          Y_te))
print(log_reg_3.score(X_te_vec,             Y_te))
print(log_reg_4.score(X_te_with_pole_score, Y_te))
print(log_reg_5.score(X_te_with_metadata,   Y_te))

0.650722043688728
0.7436927958345197
0.8541919212463898
0.8550909164870032
0.8652524102021723


In [142]:
def cond_1(df):
    return np.array(df.par_count >  0)
def cond_2(df):
    return np.array(df.par_count <  0)
def cond_3(df):
    return np.array(df.par_count == 0)
def cond_3_1(df):
    return np.array((df.par_count == 0) & (df.compound == 0))
def cond_3_2(df):
    return np.array((df.par_count == 0) & (df.compound >  0))
def cond_3_3(df):
    return np.array((df.par_count == 0) & (df.compound <  0))

conditions = [cond_1, cond_2, cond_3]
conditions_X_tr = np.array([cond(X_tr_df) for cond in conditions])
conditions_X_te = np.array([cond(X_te_df) for cond in conditions])

In [143]:
X_tr_with_metadata = sparse.csr_matrix(X_tr_with_metadata)
log_reg_list = []
for cond in conditions_X_tr:
    X_tr_subset = X_tr_with_metadata[cond]
    Y_tr_subset = np.array(Y_tr_df['is_pos'][cond])
    clf = LogisticRegression(max_iter=1000).fit(X_tr_subset, Y_tr_subset)
    log_reg_list.append(clf)

In [144]:
Y_pred_list = []
for clf in log_reg_list:
    Y_pred_list.append(clf.predict(X_te_with_metadata))
Y_pred_list = np.array(Y_pred_list)

In [145]:
pred_multi = np.empty(len(X_te_df))
for i in range(len(pred_multi)):
    a = int(np.argwhere(conditions_X_te[:,i]))
    pred_multi[i] = Y_pred_list[a, i]

In [146]:
compute_accuracy(pred_multi, Y_te)

0.8665053085465566

#### Results on the full dataset

log_reg_1.score (pol_score_te,         Y_te)  :  0.6507
log_reg_2.score (metadata_te,          Y_te)  :  0.7477
log_reg_3.score (X_te_vec,             Y_te)  :  0.8542
log_reg_4.score (X_te_with_pole_score, Y_te)  :  0.8551
log_reg_5.score (X_te_with_metadata,   Y_te)  :  0.8653
compute_accuracy(pred_multi,           Y_te)  :  0.8664

## Logistic Regression

Y_train_count_bal = np.array(Y_train['is_pos'][Y_train.parenthesis_count == 0])
Y_train_count_pos = np.array(Y_train['is_pos'][Y_train.parenthesis_count >  0])
Y_train_count_neg = np.array(Y_train['is_pos'][Y_train.parenthesis_count <  0])

X_train_count_bal_vec = X_train_all_vec[X_train.parenthesis_count == 0]
X_train_count_pos_vec = X_train_all_vec[X_train.parenthesis_count >  0]
X_train_count_neg_vec = X_train_all_vec[X_train.parenthesis_count <  0]

### Log reg different

In [76]:
X_test_all_vec.shape

NameError: name 'X_test_all_vec' is not defined

In [77]:
count_train = np.array(X_train['parenthesis_count']).reshape(-1,1)
count_test  = np.array(X_test ['parenthesis_count']).reshape(-1,1)

NameError: name 'X_train' is not defined

In [78]:
X_train_all_vec_1 = sparse.hstack((X_train_all_vec, count_train))
X_test_all_vec_1  = sparse.hstack((X_test_all_vec,  count_test ))

NameError: name 'X_train_all_vec' is not defined

In [50]:
log_reg_1 = LogisticRegression()
log_reg_2 = LogisticRegression()

log_reg_1.fit(X_train_all_vec_1, Y_train_all)
log_reg_2.fit(X_train_all_vec,   Y_train_all);

In [51]:
log_reg_1.score(X_test_all_vec_1, Y_test_all)

0.8405340914860131

In [52]:
log_reg_2.score(X_test_all_vec, Y_test_all)

0.8249987307711835

## SVM

In [23]:
from sklearn.svm import SVC

In [41]:
n = 10000
svc = SVC()
svc.fit(X_train_all_vec[:n], Y_train_all[:n]);

In [42]:
pred_svc = svc.predict(X_test_all_vec)

In [43]:
compute_accuracy(pred_svc, Y_test_all)

0.7787480326953343

## Knn

In [18]:
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import KNeighborsClassifier

In [19]:
pca = TruncatedSVD(n_components=10)
pca.fit(X_train_all_vec);

In [19]:
np.sum(pca.explained_variance_)

0.03675965786462883

In [20]:
X_train_pca = pca.transform(X_train_all_vec)
X_test_pca  = pca.transform(X_test_all_vec)

In [21]:
neigh = KNeighborsClassifier(n_neighbors=10)
neigh.fit(X_train_pca, Y_train_all);

In [26]:
n = 10000
pred_knn = neigh.predict(X_test_pca)

In [27]:
compute_accuracy(pred_knn, Y_test_all)

0.6665482053104533

## Others Less efficient methods

In [29]:
clf = RandomForestClassifier(max_depth=10, random_state=42)
clf.fit(X_train_all_vec, Y_train_all);

In [30]:
clf.score(X_test_all_vec, Y_test_all)

0.7198558156064375

In [191]:
reg = LinearRegression()
reg.fit(X_train_1, Y_train);

In [192]:
pred = reg.predict(X_test_1)
pred[pred >= 0.5] = 1
pred[pred <  0.5] = 0

# Tests

In [290]:
tweet = '(()())'

def test_parenthesis(tweet):
    count = 0
    for c in tweet:
        if c == '(':
            count += 1
        if c == ')':
            count -= 1
        if count < 0:
            return False
    return count == 0

def count_parenthesis(tweet):
    count = 0
    for c in tweet:
        if c == '(':
            count += 1
        if c == ')':
            count -= 1
    return count

In [262]:
res = tweets['Tweet'].apply(test_parenthesis)

In [283]:
not_closed = tweets[~res].copy()

In [292]:
not_closed['parenthesis_count'] = not_closed["Tweet"].apply(count_parenthesis)

In [338]:
balanced = not_closed[not_closed.parenthesis_count > 0]

In [339]:
1-np.average(np.array(balanced['is_pos']))

0.9683708952674591

In [340]:
0.14 * 0.88 + 0.86 * 0.968

0.95568

In [277]:
np.average(np.array(not_closed['is_pos']))

0.13920948374994746

## Pre-processing

In [8]:
with open('data/stopwords.pkl', 'rb') as f:
    stopwords = pickle.load(f)

#### Helper functions

``apply_functs_one_elem`` apply a list of function in the order theses are given in the array ``funct`` to one elem ``x``.

``apply_functs_array`` apply theses functions for each elem of the given array ``array``.

In [10]:
def apply_functs_one_elem(x, functs):
        for f in functs:
            x = f(x)
        return x

def apply_functs_array(array, functs):
    return [apply_functs_one_elem(desc, functs) for desc in array]

Below are simple helper function that will be used to map a description ``desc`` or bag of words ``bag``.

In [11]:
def lower(desc):
    return desc.lower()

#alphabet = "abcdefghijklmnopqrstuvwxyz 0123456789"
alphabet = "abcdefghijklmnopqrstuvwxyz "
def remove_punct(desc):
    return ''.join(c if c in alphabet else ' ' for c in desc)

def desc_to_bow(desc):
    return [word for word in desc.split(" ") if len(word)>0]

def remove_stopwords(bow):
    return [word for word in bow if word not in stopwords]

Below are the stemer and teh lematizer which take a bag of words ``bow`` and lemmatize/stem aech words.

In [12]:
ps = nltk.PorterStemmer()
def stem(bow):
    return [ps.stem(word) for word in bow]

lem = nltk.WordNetLemmatizer()
def lemmatize(bow):
    return [lem.lemmatize(word, pos="v") for word in bow]

``remove_words_by_freq`` is used to remove very frequent words and unfrequent words.

The words that will be removed are determined by the function ``get_words_by_freq`` with the initial array of bag of words ``bows``.
It get the frequency of a word by looking in how many descriptions this word appears.
It keeps the words that appears in at least ``freq_min``descriptions and at most in ``freq_max``descriptions.

``remove_words_by_freq`` will then only keep words that appears in the result of ``get_words_by_freq``.

In [43]:
def get_words_by_freq(bows, freq_min, count_by_doc=True):
    if count_by_doc:
        bows = apply_functs_array(bows, [np.unique])
    words, freq = np.unique(np.concatenate(bows), return_counts=True)
    return words[freq >= freq_min]


def remove_unfrequent_words(bows, freq_min=10, count_by_doc=True):
    words_in_freq = get_words_by_freq(bows, freq_min, count_by_doc)
    def remove_words_by_freq_1d(bow):
        return np.array([word for word in bow if word in words_in_freq])
    return np.array([remove_words_by_freq_1d(bow) for bow in bows], dtype=object)

``create_ngrams`` create ngrams for a given ``bows`` and ``n``.

``add_ngrams`` will then add the ngrams (where $n = 2,3, \ldots,$ ``max_n``) from ``bows`` to an the ``initial_array`` where the ngrams are filtered by frequencs the same way words are filtered by frequency.

In [14]:
def join_tuples(ngram):
    return [' '.join(tupl) for tupl in ngram]

def create_ngrams(bows, n):
    ngrams = [np.array(list(nltk.ngrams(bow,n))) for bow in bows]
    return [join_tuples(ngram) for ngram in ngrams]

def add_ngrams(initial_array, bows, min_freq, max_n):
    res = initial_array
    for n in range(2,max_n+1):
        ngrams = create_ngrams(bows, n)
        r = remove_unfrequent_words(ngrams, min_freq)
        res = [np.append(res[i], r[i]) for i in range(len(res))]
    return res

### Cleaning the data for courses

In [53]:
tweets_pos = np.empty(len(train_pos), dtype=object)
for i in range(len(train_pos)):
    tweets_pos[i] = train_pos.iloc[i]['Tweet']

In [54]:
bows = apply_functs_array(tweets_pos, [lower,
                                       remove_punct,
                                       desc_to_bow,
                                       lemmatize,
                                       remove_stopwords,
                                       stem
                                       ])

In [55]:
words_in_freq = get_words_by_freq(bows, 5, count_by_doc=False)

In [56]:
words_in_freq

array(['a', 'aa', 'aaa', ..., 'zy', 'zz', 'zzz'], dtype='<U133')

In [44]:
bows_freq = remove_unfrequent_words(bows, 5, count_by_doc=False)

In [45]:
bows_freq

array([array(['user', 'dunno', 'justin', 'read', 'mention', 'justin', 'god',
              'hope', 'follow'], dtype='<U7')                               ,
       array(['logic', 'dumb', 'win', 'crop', 'photo', 'tsk', 'url'], dtype='<U5'),
       array(['user', 'put', 'casper', 'box', 'loov', 'battl'], dtype='<U6'),
       ...,
       array(['user', 'user', 'um', 'read', 'profil', 'lunch', 'rider', 'fan'],
             dtype='<U6')                                                      ,
       array(['user', 'excit', 'tomorrow', 'xx'], dtype='<U8'),
       array(['job', 'applic', 'hooter', 'give', 'bra', 'fill'], dtype='<U6')],
      dtype=object)

In [18]:
bows_with_ngrams = add_ngrams(bows_freq, bows, 5, len(bows), 2)

In [19]:
bows_with_ngrams

[array(['user', 'dunno', 'justin', 'read', 'mention', 'justin', 'god',
        'hope', 'follow', 'user dunno', 'hope follow'], dtype='<U11'),
 array(['dumb', 'win', 'crop', 'photo', 'url'], dtype='<U32'),
 array(['user', 'put', 'box', 'battl', 'user put'], dtype='<U8'),
 array(['user', 'user', 'sir', 'don', 'trip', 'lil', 'mama', 'doin', 'ya',
        'thang', 'user user', 'user sir'], dtype='<U9'),
 array(['visit', 'brother', 'tmr', 'bestest', 'birthday', 'gift',
        'birthday gift'], dtype='<U13'),
 array(['user', 'yay', 'tweet', 'facebook', 'user yay'], dtype='<U8'),
 array(['user', 'dnextalbumtitl', 'feel', 'life', 'song', 'life', 'yolo',
        'famou'], dtype='<U32'),
 array(['workin', 'hard', 'workin', 'rt', 'user', 'futur', 'user',
        'rt user'], dtype='<U7'),
 array(['user', 'll', 'repli', 'bite', 'user ll'], dtype='<U7'),
 array(['belong'], dtype='<U32'),
 array(['user', 'cheer', 'nation'], dtype='<U32'),
 array(['send', 'invit', 'shop', 'line', 'find', 'leav', 'hom

## Term-document matrix

In [20]:
def count_occurences(bow, codewords):
    line = np.array([np.argwhere(word==codewords)[0,0] for word in bow])
    line = np.append(line,len(codewords))
    return np.bincount(line)[0:-1]

In [21]:
def IDF(X):
    N = len(bows)
    n = np.sum(np.where(X>0,1,0),axis=0)
    return -np.log(n/N)

In [22]:
def TF(X):
    max_f = np.max(X, axis=0)
    return X/max_f

In [23]:
def create_TF_IDF(bows, codewords):
    X = np.array([count_occurences(bow, codewords) for bow in bows])
    return sparse.csr_matrix(TF(X)*IDF(X))

#### Creating X

In [24]:
codewords = np.unique(np.concatenate(bows_with_ngrams))

X = create_TF_IDF(bows_with_ngrams, codewords)

TypeError: Cannot cast array data from dtype('float64') to dtype('int64') according to the rule 'safe'