In [25]:
import pandas as pd
import numpy as np
from utils import delete_columns, preprocess_tweet_text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

In [26]:
colnames = ["polarity", "id", "date", "query", "user", "text"]
df = pd.read_csv("data/Data_tweets.csv",names=colnames)

In [27]:
df

Unnamed: 0,polarity,id,date,query,user,text
0,0,2068921155,Sun Jun 07 14:56:42 PDT 2009,NO_QUERY,smiley_sophie,my arm still hurts from when i pulled it yeste...
1,4,2065871668,Sun Jun 07 09:27:21 PDT 2009,NO_QUERY,ImmaChocoholic,I have so much to do outside! Been looking at ...
2,0,1835774749,Mon May 18 06:43:27 PDT 2009,NO_QUERY,drmomentum,"@AbsolutSara Yes, I knew about the clusterfark..."
3,0,1967121891,Fri May 29 19:00:46 PDT 2009,NO_QUERY,sweetsheilx,Just woke up and i feel relieved Haha now i ha...
4,4,1695846172,Mon May 04 07:04:29 PDT 2009,NO_QUERY,monmariej,LOVING the hot weather forecast for the rest o...
...,...,...,...,...,...,...
29995,4,1989611723,Mon Jun 01 02:10:20 PDT 2009,NO_QUERY,danjwhite1,"@AaronRenfree be strong, You can do it! x"
29996,0,2055276562,Sat Jun 06 09:01:28 PDT 2009,NO_QUERY,BethanyMcFlyX,@JessMcFlyxxx haha you ok? my tv has gone bloo...
29997,0,2207484837,Wed Jun 17 08:00:01 PDT 2009,NO_QUERY,nicphillips83,Torrential rain + Take That fans traffic = bad...
29998,4,2013485600,Tue Jun 02 22:47:33 PDT 2009,NO_QUERY,ghalya,"feels loved... ahhhhhhhhhhh &amp; now, i will ..."


Checking column's properties

In [28]:
df.dtypes

polarity     int64
id           int64
date        object
query       object
user        object
text        object
dtype: object

In [29]:
print(df["polarity"].unique())
print(df["query"].unique())
print(df["user"].unique().shape)

[0 4]
['NO_QUERY']
(27770,)


In [30]:
useless_cols = ["id", "date", "query", "user"]
df = delete_columns(df, useless_cols)

In [31]:
df.text = df['text'].apply(preprocess_tweet_text)

In [32]:
df

Unnamed: 0,polarity,text
0,0,arm still hurts pulled yesterday
1,4,I much outside Been looking 4 years wno energy...
2,0,Yes I knew clusterfark Its way since right PhD...
3,0,Just woke feel relieved Haha go work 2 hours
4,4,LOVING hot weather forecast rest week Summer a...
...,...,...
29995,4,strong You x
29996,0,haha ok tv gone bloody gay boo sucks xx
29997,0,Torrential rain Take That fans traffic bad fir...
29998,4,feels loved ahhhhhhhhhhh amp sleep angels


Preparing data for ML models

In [14]:

vector = TfidfVectorizer(sublinear_tf=True)
vector.fit(np.array(df.iloc[:,1]).ravel())
X = vector.transform(np.array(df.iloc[:,1]).ravel())
y = np.array(df.iloc[:, 0]).ravel()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)

Logistics Regression model

In [19]:
LR_model = LogisticRegression(solver='lbfgs')
LR_model.fit(X_train, y_train)
y_predict_lr = LR_model.predict(X_test)

Naive Bayes model

In [20]:
NB_model = MultinomialNB()
NB_model.fit(X_train, y_train)
y_predict_nb = NB_model.predict(X_test)

Results comparison

In [22]:
print("Logistics Regression accuracy:", accuracy_score(y_test, y_predict_lr))
print("Naive Bayes:", accuracy_score(y_test, y_predict_nb))

Logistics Regression accuracy: 0.7510666666666667
Naive Bayes: 0.7373333333333333
