# Tweet sentiment prediction

In [33]:
%matplotlib inline

import gensim
import json
import nltk
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pprint
import re
import string
import time
import os

from collections import defaultdict
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.ensemble import GradientBoostingClassifier

## Data parsing

### Feature extraction from each tweet

In [34]:
# Reading in all files
lines = []
file_path = 'tweets/'
files = [os.path.join(file_path, x) for x in os.listdir(file_path)]
for fle in files:
    tweets_file = open(fle, 'r')
    _temp = tweets_file.readlines()
    lines += _temp
    tweets_file.close()
print ("Number of tweets: %d" % len(lines))

Number of tweets: 15513


In [35]:
pprint.pprint(json.loads(lines[0].strip()))

{'contributors': None,
 'coordinates': None,
 'created_at': 'Tue Aug 15 17:58:26 +0000 2017',
 'entities': {'hashtags': [{'indices': [45, 52], 'text': 'patent'},
                           {'indices': [65, 68], 'text': 'IP'}],
              'symbols': [{'indices': [103, 108], 'text': 'GOOG'},
                          {'indices': [109, 112], 'text': 'FB'}],
              'urls': [{'display_url': 'iam-media.com/blog/Detail.as…',
                        'expanded_url': 'http://www.iam-media.com/blog/Detail.aspx?g=afc6cc58-706a-475d-906a-fd85bd1e49f1',
                        'indices': [113, 136],
                        'url': 'https://t.co/FiHWRiETq3'}],
              'user_mentions': [{'id': 108564136,
                                 'id_str': '108564136',
                                 'indices': [3, 16],
                                 'name': 'IAM',
                                 'screen_name': 'IAM_magazine'}]},
 'favorite_count': 0,
 'favorited': False,
 'geo': None,
 'id':

In [36]:
#TODO use all twitter files
data = defaultdict(dict)
i=0
for line in lines:
    tweet = json.loads(line.strip())
    if 'text' in tweet: # only messages contains 'text' field is a tweet
        ts = time.strptime(tweet['created_at'],'%a %b %d %H:%M:%S +0000 %Y')
        data[i]["time"] = time.mktime(ts)  
        data[i]["text"] = tweet['text']
    if 'urls' in tweet['entities']:
        #print tweet['entities']['urls']
        data[i]["urls"] = len(tweet['entities']['urls'])
    if 'hashtags' in tweet['entities']:
        data[i]["hashtags"] = len(tweet['entities']['hashtags'])
    i += 1

In [37]:
print (data[0])
print (data[1])

{'time': 1502845106.0, 'text': 'RT @IAM_magazine: Exclusive: In major Valley #patent move Google #IP head Allen Lo is joining Facebook $GOOG $FB https://t.co/FiHWRiETq3', 'urls': 1, 'hashtags': 2}
{'time': 1502844958.0, 'text': 'RT @arnabch01: #investors massive bubble in #tech be careful $AAPL $GOOG $MSFT $AMZN $FB $NFLX $TSLA $CSCO $INTC $NVDA $ZNGA $ORCL $JD $MU…', 'urls': 0, 'hashtags': 2}


In [38]:
#working with text
#tokenizer for tweets
tknzr = TweetTokenizer(strip_handles=True) #(strip_handles=True, reduce_len=True)
stop_words = set(stopwords.words('english'))
corpus = []
for i, info in data.copy().items():  
    text = info['text'].lower() #.encode('utf-8').decode('ascii','ignore') # content of the tweet
    text = re.sub("http\S*", '', text) # remove urls
    text = re.sub("^rt", '', text) # remove rt
    text = text.replace('#', '') # remove hashtag
    text = re.sub('[^\w\s]', '', text) # remove all non-space and non-[a-zA-Z0-9_] characters
    text = re.sub('\d+', '', text)  # remove all numbers
    
    # REMOVING MISSPELLINGS WITH MULTIPLE CONTINUOUS LETTERS!
    text = re.sub('(.)\1+', '\1\1', text)
    words = tknzr.tokenize(text)
    
    text = " ".join(words) # .encode('utf-8')

    # REMOVING DUPLICATES!
    if text in corpus:
        del data[i]
        continue
    
    corpus.append(text)
    data[i]['text'] = text
    data[i]['exclamations'] = words.count('!')
    data[i]['questions'] = words.count('?')
    data[i]['dollar'] = words.count('$')
    data[i]['num_words'] = len(text) 
          

In [39]:
print (data[0])
print (data[1])
print (len(data))


{'time': 1502845106.0, 'text': 'iam_magazine exclusive in major valley patent move google ip head allen lo is joining facebook goog fb', 'urls': 1, 'hashtags': 2, 'exclamations': 0, 'questions': 0, 'dollar': 0, 'num_words': 102}
{'time': 1502844958.0, 'text': 'arnabch investors massive bubble in tech be careful aapl goog msft amzn fb nflx tsla csco intc nvda znga orcl jd mu', 'urls': 0, 'hashtags': 2, 'exclamations': 0, 'questions': 0, 'dollar': 0, 'num_words': 115}
8415


### Understanding the data

In [40]:
df = pd.DataFrame.from_dict(data, orient='index')
df.describe()

Unnamed: 0,time,urls,hashtags,exclamations,questions,dollar,num_words
count,8415.0,8415.0,8415.0,8415.0,8415.0,8415.0,8415.0
mean,1504406000.0,0.698633,0.525015,0.0,0.0,0.0,79.985027
std,929050.7,0.54222,1.302777,0.0,0.0,0.0,28.010856
min,1502506000.0,0.0,0.0,0.0,0.0,0.0,4.0
25%,1503618000.0,0.0,0.0,0.0,0.0,0.0,61.0
50%,1504391000.0,1.0,0.0,0.0,0.0,0.0,82.0
75%,1505260000.0,1.0,0.0,0.0,0.0,0.0,102.0
max,1505956000.0,3.0,12.0,0.0,0.0,0.0,140.0


In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8415 entries, 0 to 15492
Data columns (total 8 columns):
time            8415 non-null float64
text            8415 non-null object
urls            8415 non-null int64
hashtags        8415 non-null int64
exclamations    8415 non-null int64
questions       8415 non-null int64
dollar          8415 non-null int64
num_words       8415 non-null int64
dtypes: float64(1), int64(6), object(1)
memory usage: 591.7+ KB


## Feature Extraction from Text

### Text features based on frequencies

In [42]:
#remove duplicates
df = df.drop_duplicates(subset=['text'], keep=False)
df.describe()
df = df.reset_index(drop=True)

In [43]:
from sklearn.feature_extraction.text import CountVectorizer

word_vectorizer = CountVectorizer(analyzer='word', stop_words='english')
sparse_matrix = word_vectorizer.fit_transform(df['text'])
frequencies = sum(sparse_matrix).toarray()[0]
words = pd.DataFrame(frequencies, index=word_vectorizer.get_feature_names(), columns=['frequency'])
print (words.describe())
words.head(10)

          frequency
count  12704.000000
mean       6.499292
std       39.621714
min        1.000000
25%        1.000000
50%        2.000000
75%        4.000000
max     1631.000000


Unnamed: 0,frequency
__,1
___,1
_anthrobear,1
_ayouba_,1
_free_,1
_jackmohr,1
_ms_izzy,1
_rone,1
_seandavid,4
_thethletter,1


### Smaller dictionary

In [44]:
word_vectorizer = CountVectorizer(analyzer='word', stop_words='english',min_df=2, max_df=3000)
sparse_matrix = word_vectorizer.fit_transform(df['text'])
frequencies = sum(sparse_matrix).toarray()[0]
words = pd.DataFrame(frequencies, index=word_vectorizer.get_feature_names(), columns=['frequency'])
print (words.describe())
words.head(10)

         frequency
count  6368.000000
mean     11.964353
std      55.427568
min       2.000000
25%       2.000000
50%       4.000000
75%       8.000000
max    1631.000000


Unnamed: 0,frequency
_seandavid,4
aa,17
aaba,5
aal,8
aaoi,7
aap,9
aapl,1631
aapls,11
aaron,2
ab,3


### Finding structure in text

In [45]:
del words
#create data_samples
#data_samples= [t['text'] for t in data.values()]

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)


In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

n_features = 250
n_components = 10
n_top_words = 10


### Counts
tf_vectorizer = CountVectorizer(min_df=2, max_df=1000, stop_words='english')
tf = tf_vectorizer.fit_transform(df.text)
tf_feature_names = tf_vectorizer.get_feature_names()
print (tf_feature_names[:40])

['_seandavid', 'aa', 'aaba', 'aal', 'aaoi', 'aap', 'aapls', 'aaron', 'ab', 'abb', 'abbv', 'abc', 'abco', 'abeo', 'abil', 'ability', 'able', 'abnormalreturns', 'aboard', 'abou', 'absolutely', 'abt', 'abvg', 'abx', 'academy', 'acadian', 'acanal', 'accelerates', 'accelerating', 'accelerator', 'acceleratorincubator', 'accenture', 'accept', 'access', 'accessories', 'according', 'accordingly', 'account', 'accounting', 'accounts']


### TF-IDF as text features

In [47]:
## TF-IDF
tfidf_vectorizer = TfidfVectorizer(min_df=2,max_df=1000,stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(df.text)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

## Finding topics with LDA

In [48]:
lda = LatentDirichletAllocation(n_components=n_components, learning_method='online')
lda.fit(tf) ## fitting counts
print_top_words(lda, tf_feature_names, n_top_words)

Topic #0: new iphone ibm sell calls google bullish vs amp year
Topic #1: ibm shares business international machines corporation week amp sells look
Topic #2: ibm spy nflx twtr crm nvda qqq wmt brk amd
Topic #3: apple just amazon day short high going buy getting need
Topic #4: box twtr googl buy stocks check ai nice soon coming
Topic #5: management position microsoft alphabet llc stake million today corporation capital
Topic #6: trading tech googl amp long company free chart nflx bitcoin
Topic #7: market baba box googl options amp earnings spx car value
Topic #8: tesla big read ibm holdings trade price good cloud model
Topic #9: stocks stock time investing news snapchat like stockmarket says money


In [49]:
lda = LatentDirichletAllocation(n_components=n_components, learning_method='online')
lda.fit(tfidf) ## fitting tf-idf counts
print_top_words(lda, tfidf_feature_names, n_top_words)

Topic #0: know going hit need highs people timothysykes love launch articles
Topic #1: calls look cars buying electric sep gm day analysts future
Topic #2: spy nflx googl stocks free qqq market twtr stock nvda
Topic #3: ibm read tesla new box price blockchain check ai model
Topic #4: getting group trades ive largest great th strong amazing sell
Topic #5: box buy rating lol earnings hold weekly september stock lets
Topic #6: facebook googl box google data partners options value way car
Topic #7: good amazon high time stocks trade year snapchat chart close
Topic #8: holdings ibm twtr stocks big crm brk agn amd baba
Topic #9: corporation shares management business position llc microsoft alphabet stake international


Comparing the LDAs here with those obtained from the smaller tweet database, we don't see much discernible change. Maybe, if we include more data, the information could become clearer!

## Simple sentiment analysis

In [52]:
positive = pd.read_csv('positive-words.txt', names=['a'], encoding = "ISO-8859-1")
positive = set(positive['a'].tolist())
negative = pd.read_csv('negative-words.txt', names=['a'], encoding = "ISO-8859-1")
negative = set(negative['a'].tolist())

count_positive = []
count_negative = []
for i, row in df.iterrows():
    commonp = set(row['text'].split()).intersection(positive) 
    count_positive.append(len(commonp))
    commonn = set(row['text'].split()).intersection(negative) 
    count_negative.append(len(commonn))

df['positive'] = count_positive
df['negative'] = count_negative
df.head(10)

Unnamed: 0,time,text,urls,hashtags,exclamations,questions,dollar,num_words,positive,negative
0,1502845000.0,iam_magazine exclusive in major valley patent ...,1,2,0,0,0,102,0,0
1,1502845000.0,arnabch investors massive bubble in tech be ca...,0,2,0,0,0,115,0,0
2,1502845000.0,nyinvesting google goog is the embodiment of m...,0,6,0,0,0,114,2,1
3,1502845000.0,greenstocks timberr iwm spy tlt gs gld btc goo...,0,0,0,0,0,106,0,0
4,1502845000.0,bank of nova scotia buys shares of alphabet in...,1,0,0,0,0,52,0,0
5,1502845000.0,alphabet inc goog stake raised by north star a...,1,0,0,0,0,65,0,0
6,1502845000.0,themotleyfool the machines keep getting smarte...,1,0,0,0,0,94,2,0
7,1502844000.0,as alphabet goog valuation rose robshaw amp ju...,0,0,0,0,0,88,0,0
8,1502844000.0,warren averett asset management llc boosts pos...,1,0,0,0,0,72,0,0
9,1502844000.0,goog himx vuzi great article,1,0,0,0,0,28,1,0


In [53]:
df.describe()

Unnamed: 0,time,urls,hashtags,exclamations,questions,dollar,num_words,positive,negative
count,8415.0,8415.0,8415.0,8415.0,8415.0,8415.0,8415.0,8415.0,8415.0
mean,1504406000.0,0.698633,0.525015,0.0,0.0,0.0,79.985027,0.423173,0.273678
std,929050.7,0.54222,1.302777,0.0,0.0,0.0,28.010856,0.666,0.56529
min,1502506000.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0
25%,1503618000.0,0.0,0.0,0.0,0.0,0.0,61.0,0.0,0.0
50%,1504391000.0,1.0,0.0,0.0,0.0,0.0,82.0,0.0,0.0
75%,1505260000.0,1.0,0.0,0.0,0.0,0.0,102.0,1.0,0.0
max,1505956000.0,3.0,12.0,0.0,0.0,0.0,140.0,6.0,4.0


### Positive sentiment

In [54]:
df[(df['positive'] >0) & (df['negative']  == 0)]['text'].head(50)

6      themotleyfool the machines keep getting smarte...
9                           goog himx vuzi great article
10     robertrelder apples bargaining power rising go...
14     arnabch ai robotics bigdata genomics stemcell ...
15     applewatch to support both lte and nonlte mode...
20     beijing transit contactless mpayment system ex...
26     tweaktown pr asrockinfo introduces the x iot r...
29     pr asrockinfo introduces the x iot router for ...
41     stocktwits since its ipo home depot is actuall...
45     edborgato amzns same day pick up locations are...
49     would be amazed if jana partners manage to sel...
56           gs aapl amzn need to lead us higher spx dji
58     active traders try one of these free trading g...
59     xplr join us for play by play action on stocks...
60     amzn pzza restaurants are in a tech race to ma...
68     amzn part bmark offering guidance y y y y y y ...
73     hot options alert midday tuesday august bac dk...
77     there is a chance apple 

### Negative sentiment

In [55]:
df[(df['positive'] ==0) & (df['negative']  > 0)]['text'].head(20)

18     goog neonazi group moves to dark web after web...
30     arnabch will advances in ai ml robotics nanote...
33     arnabch hpc ai ml bigdata may soon enable geno...
39     discussing the retail landscape department sto...
48     sitrep risk on mrk ceo youre fired amzn gs leg...
54     amzn aap wmt amazon will probably go onto crus...
55     dont worry about how many shares you can buy c...
67     thestreet amazon will probably go onto crush a...
83     tsla sa another risk factor for tesla shorts d...
99     microsoft acquires cloudcomputing orchestratio...
115    international business machines ibm fall to no...
117             the blue cloud collapses i told u ibm so
121    jimcramer mariabartiromo so u wont ask ginni a...
122    seekingalpha ibm watson disappointment risks f...
123    ibm watson disappointment risks further downwa...
128    china big market thus saith ginni so far zero ...
131    marketsupchuck is ibms dividend yield killing ...
132    is ibms dividend yield k

In [56]:
print ("Total tweets:", len(df))
print ("Total tweets positive:",len(df[(df['positive'] >0) & (df['negative']  == 0)]))
print ("Total tweets negative:",len(df[(df['positive'] == 0) & (df['negative']  > 0)]))
print ("Tweets with no info:", len(df[(df['positive'] == 0) & (df['negative']  == 0)]))
print ("neutral tweets:", len(df[(df['positive'] >0) & (df['negative']  > 0)]))

Total tweets: 8415
Total tweets positive: 2212
Total tweets negative: 1227
Tweets with no info: 4339
neutral tweets: 637


## Content Similarity using word embedings (Word2Vec)

In [None]:
from gensim.models import KeyedVectors
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True, )

### Converting each tweet into a vector

In [57]:
from gensim import matutils

stop_words = set(stopwords.words('english'))

matrix = []
filtered = []
useless_indx = []
counter = 0

for i, row in data.items():
    filtered_text = [model[w] for w in row['text'].split() if w in model and w not in stop_words]
    filtered.append([w for w in row['text'].split() if w in model and w not in stop_words])
    if len(filtered_text):
        matrix.append(matutils.unitvec(np.array(filtered_text).mean(axis=0)))
    else:
        useless_indx.append(counter)
    counter += 1
    
df = df.drop(df.index[useless_indx], inplace = False)
df = df.reset_index()

In [58]:
#computing similarity between tweets

matrix = np.array(matrix)
sim = np.dot(matrix, matrix.transpose())
print (sim)

[[1.         0.672333   0.60287177 ... 0.34146225 0.54058195 0.20146088]
 [0.672333   1.         0.61664081 ... 0.27026728 0.44768505 0.22181737]
 [0.60287177 0.61664081 1.         ... 0.26864053 0.44867838 0.2980352 ]
 ...
 [0.34146225 0.27026728 0.26864053 ... 1.         0.58074919 0.64923257]
 [0.54058195 0.44768505 0.44867838 ... 0.58074919 1.         0.39800358]
 [0.20146088 0.22181737 0.2980352  ... 0.64923257 0.39800358 1.        ]]


In [59]:
#reshaping into a data frame
print (sim.shape)
dup = np.fill_diagonal(sim, 0)

simdf = pd.DataFrame(list(sim[np.triu_indices(sim.shape[1], 1)]))
simdf.describe()

(8411, 8411)


Unnamed: 0,0
count,35368260.0
mean,0.3883489
std,0.1193082
min,-0.1521175
25%,0.3084871
50%,0.3881512
75%,0.4674592
max,1.0


Since the dimension of the transformed matrix post word2vec above is 4 less than expected (8411 v/s 8415), I also remove the relevant row items from *df*.

### Get the most similar tweets for each sentiment

In [60]:
pos = 41
most_similar = np.argmax(sim[pos][:])
print ("similarity:", sim[pos][most_similar])
print (df.iloc[pos]['text'])
print (filtered[pos])
print (df.iloc[most_similar]['text'])
print (filtered[most_similar])

similarity: 0.6785147315887967
stocktwits since its ipo home depot is actually outperforming amazon compare the green to the yellow line on this
['since', 'ipo', 'home', 'depot', 'actually', 'outperforming', 'amazon', 'compare', 'green', 'yellow', 'line']
ndygrosso true fb also traded below its ipo price but then reclaimed it over gain since waiting for twtr
['remember', 'fb', 'ipo', 'price', 'wanted', 'look', 'going']


In [61]:
neg = 55
most_similar = np.argmax(sim[neg][:])
print ("similarity:", sim[neg][most_similar])
print (df.iloc[neg]['text'])
print (filtered[neg])
print (df.iloc[most_similar]['text'])
print (filtered[most_similar])

similarity: 0.8729076740177735
dont worry about how many shares you can buy concern yourself wthe return on those shares stocks amzn googl
['dont', 'worry', 'many', 'shares', 'buy', 'concern', 'wthe', 'return', 'shares', 'stocks']
dont stay away from stocks with a high share price its okay to only buy a couple of shares stocks amzn googl
['need', 'since', 'walmart', 'wmt', 'paid', 'b', 'corporate', 'income', 'tax', 'amazon', 'paid', 'b', 'amazon']


### Create a labelled dataset, and split into train/test

The data is slightly imbalanced, since positive-to-negative tweet ratio is ~1.8. But this is not typically the skewed ratio that is cause for concern unlike in fraud detection problems.

In [62]:
df['label'] = 1 * ((df['positive'] >0) & (df['negative']  == 0)) - 1 * ((df['positive'] == 0) & (df['negative']  > 0))
df_model = df.drop(columns = ['positive', 'negative', 'exclamations', 'questions', 'dollar', 'text'], inplace = False)
df_model = df_model[~((df['positive'] > 0) & (df['negative']  > 0))]    # Removing amiguous/neutral data
df_no_info = df_model[df_model['label'] == 0]
df_model = df_model[df_model['label'] != 0]

matrix = pd.DataFrame(matrix, columns = ['X' + str(x) for x in range(matrix.shape[1])])
df_model = df_model.join(matrix, how = 'left')
df_no_info = df_no_info.join(matrix, how = 'left')

X = df_model.drop(columns = ['label'], inplace = False)
y = df_model['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

for n_estimators in [100, 300, 500]:
    for max_depth in [3, 7, 10]:
        clf = GradientBoostingClassifier(n_estimators = n_estimators,
                                         max_depth = max_depth,
                                         learning_rate = 0.1)
        clf.fit(X_train, y_train)
        y_predict_class = clf.predict(X_test)
        print ("Accuracy score for %d estimators with max tree depth of %d: %f"
               % (n_estimators, max_depth, accuracy_score(y_test, y_predict_class)))

Accuracy score for 100 estimators with max tree depth of 3: 0.821221
Accuracy score for 100 estimators with max tree depth of 7: 0.848837
Accuracy score for 100 estimators with max tree depth of 10: 0.841570
Accuracy score for 300 estimators with max tree depth of 3: 0.869186
Accuracy score for 300 estimators with max tree depth of 7: 0.860465
Accuracy score for 300 estimators with max tree depth of 10: 0.848837
Accuracy score for 500 estimators with max tree depth of 3: 0.876453
Accuracy score for 500 estimators with max tree depth of 7: 0.853198
Accuracy score for 500 estimators with max tree depth of 10: 0.840116


I train a Gradient Boosting model for multiple hyper-parameter combinations to see what would give good OOS results. The best one will be selected for the task of labelling the tweets without any label information. The predicted class can be found in the 'label' column of df_no_info.

As we see from above results, as we increase number of trees, keeping small trees in place, the model performs better. Thus, I choose an n_estimators of 1000 and max_depth of 3 below


### Classifying tweets with no information

In [66]:
clf = GradientBoostingClassifier(n_estimators = 1000, max_depth = 3, learning_rate = 0.1)
clf.fit(X_train, y_train)
y_predict_class = clf.predict(X_test)
print ("Accuracy score for 1000 estimators with max tree depth of 3: %f" % accuracy_score(y_test, y_predict_class))
df_no_info['label'] = clf.predict(df_no_info.drop(columns = ['label'], inplace = False))
df_no_info.head()

Accuracy score for 1000 estimators with max tree depth of 3: 0.886628


Unnamed: 0,index,time,urls,hashtags,num_words,label,X0,X1,X2,X3,...,X290,X291,X292,X293,X294,X295,X296,X297,X298,X299
0,0,1502845000.0,1,2,102,-1,-0.002412,0.012745,-0.009593,0.092464,...,0.068362,0.093186,-0.058904,0.054739,-0.014146,-0.089333,0.009702,-0.032968,-0.034036,0.050125
1,1,1502845000.0,0,2,115,1,0.057487,0.035829,0.0006,0.101208,...,0.025843,0.022043,-0.021395,-0.029454,0.013659,-0.083605,-0.048005,-0.015275,-0.077464,0.041782
3,3,1502845000.0,0,0,106,-1,-0.063916,0.053886,-0.007517,0.047148,...,0.045912,0.0881,-0.050449,0.030778,0.00932,-0.049032,-0.067791,-0.054342,-0.07633,0.069449
4,4,1502845000.0,1,0,52,1,-0.008953,-0.012955,-0.008861,0.060886,...,0.014254,-0.002642,-0.156609,0.008664,0.088631,-0.031812,0.007395,0.056196,0.010431,0.066509
5,5,1502845000.0,1,0,65,1,-0.050434,0.006423,-0.011624,-1.4e-05,...,-0.030207,0.037583,-0.138747,0.028643,0.043147,-0.103074,0.004515,0.023327,-0.008192,-0.009076


We can see the labelled prediction for each tweet in df_no_info. I recommend using something like the GradientBoostedClassifier, since boosting and other ensemble methods tend to outperform generally and specially in cases of imbalanced datasets. We see that the accuracy on the test set for our trained model is ~88.7%