# Importing required libraries

In [41]:
import numpy as np
import pandas as pd
import seaborn as sns
import nltk
import string
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,precision_score,precision_score

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Importing the data(csv file)

In [173]:
df=pd.read_csv('training.1600000.processed.noemoticon.csv',encoding='latin-1',low_memory=True)

In [174]:
df.head(5)

Unnamed: 0,polarity of tweet,id of the tweet,date of the tweet,query,user,text of the tweet
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [175]:
df.drop(columns=['id of the tweet','query','user','date of the tweet'],inplace=True)

In [176]:
df.head(5)

Unnamed: 0,polarity of tweet,text of the tweet
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew


# Data Cleaning and Preprocessing

In [177]:
df.shape

(1048572, 2)

In [178]:
df.size

2097144

In [179]:
df.isnull().sum()

polarity of tweet     0
text of the tweet     0
dtype: int64

In [180]:
df.isna().sum()

polarity of tweet     0
text of the tweet     0
dtype: int64

In [181]:
df.duplicated()

0          False
1          False
2          False
3          False
4          False
           ...  
1048567    False
1048568    False
1048569    False
1048570    False
1048571    False
Length: 1048572, dtype: bool

In [182]:
df.drop_duplicates(inplace=True)

In [183]:
df.shape

(1036908, 2)

In [184]:
df.columns=['target','text']

In [185]:
df['target'].value_counts()

0    790181
4    246727
Name: target, dtype: int64

In [187]:
import re

In [188]:
ps=PorterStemmer()

In [189]:
stop_words=nltk.corpus.stopwords.words('english')

In [190]:
def clean_text1(text):
    text="".join([c for c in text if c not in string.punctuation])
    tokens=re.split('\W+', text)
    text=[word for word in tokens if word not in stop_words]
    return text

In [191]:
df["text1"] = df["text"].apply(lambda x: clean_text1(x.lower()))

In [192]:
df.head(5)

Unnamed: 0,target,text,text1
0,0,is upset that he can't update his Facebook by ...,"[upset, cant, update, facebook, texting, might..."
1,0,@Kenichan I dived many times for the ball. Man...,"[kenichan, dived, many, times, ball, managed, ..."
2,0,my whole body feels itchy and like its on fire,"[whole, body, feels, itchy, like, fire, ]"
3,0,"@nationwideclass no, it's not behaving at all....","[nationwideclass, behaving, im, mad, cant, see, ]"
4,0,@Kwesidei not the whole crew,"[kwesidei, whole, crew, ]"


In [193]:
def stemming(tokenized_word):
    text=[ps.stem(word) for word in tokenized_word]
    return text

In [194]:
df['text1']=df['text1'].apply(lambda x: stemming(x))

In [195]:
df.sample(15)

Unnamed: 0,target,text,text1
91277,0,they have wifi in the terminal. Otherwise I wo...,"[wifi, termin, otherwis, wont, tweet, late, tu..."
202259,0,@JaymeFoxx sounds like a great saturday. I can...,"[jaymefoxx, sound, like, great, saturday, cant..."
590614,0,@IAMJREAL hey real i wont be able to make it s...,"[iamjreal, hey, real, wont, abl, make, sat, du..."
578998,0,I feel a lot better! I was sleeping most of to...,"[feel, lot, better, sleep, today, bad, miss, f..."
1019007,4,@everydaystrange And - ahem - there they are. ...,"[everydaystrang, ahem, wow, awesom, ]"
570126,0,I dont get this,"[dont, get, ]"
900327,4,Morning.. Coffee and fresh air,"[morn, coffe, fresh, air, ]"
336005,0,@emilyawilliams your powers out? icky!,"[emilyawilliam, power, icki, ]"
832915,4,Gaga in Europe and Bizarres in America.. o_O w...,"[gaga, europ, bizarr, america, oo, strang, wor..."
580049,0,I can't even get the OSX Java update to D/L,"[cant, even, get, osx, java, updat, dl, ]"


In [196]:
df.drop(columns=['text'],inplace=True)

In [197]:
df

Unnamed: 0,target,text1
0,0,"[upset, cant, updat, facebook, text, might, cr..."
1,0,"[kenichan, dive, mani, time, ball, manag, save..."
2,0,"[whole, bodi, feel, itchi, like, fire, ]"
3,0,"[nationwideclass, behav, im, mad, cant, see, ]"
4,0,"[kwesidei, whole, crew, ]"
...,...,...
1048567,4,"[grandma, make, dinenr, mum, ]"
1048568,4,"[midmorn, snack, time, bowl, chees, noodl, yum, ]"
1048569,4,"[shadela, say, like, termini, movi, come, like..."
1048570,4,"[destinyhope92, im, great, thaank, wbuu]"


In [198]:
df['text2']=df['text1'].apply(lambda x: ' '.join(x))

In [199]:
df

Unnamed: 0,target,text1,text2
0,0,"[upset, cant, updat, facebook, text, might, cr...",upset cant updat facebook text might cri resul...
1,0,"[kenichan, dive, mani, time, ball, manag, save...",kenichan dive mani time ball manag save 50 res...
2,0,"[whole, bodi, feel, itchi, like, fire, ]",whole bodi feel itchi like fire
3,0,"[nationwideclass, behav, im, mad, cant, see, ]",nationwideclass behav im mad cant see
4,0,"[kwesidei, whole, crew, ]",kwesidei whole crew
...,...,...,...
1048567,4,"[grandma, make, dinenr, mum, ]",grandma make dinenr mum
1048568,4,"[midmorn, snack, time, bowl, chees, noodl, yum, ]",midmorn snack time bowl chees noodl yum
1048569,4,"[shadela, say, like, termini, movi, come, like...",shadela say like termini movi come like 3 word
1048570,4,"[destinyhope92, im, great, thaank, wbuu]",destinyhope92 im great thaank wbuu


In [200]:
df.drop(columns=['text1'],inplace=True)

# Model Building

In [201]:
from sklearn.feature_extraction.text import TfidfVectorizer
tdidf=TfidfVectorizer()

In [202]:
X=tdidf.fit_transform(df['text2'])

In [203]:
y=df['target']

In [204]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=2)

In [205]:
model=LogisticRegression()

In [206]:
model.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [207]:
y_pred=model.predict(X_test)

In [210]:
accuracy=accuracy_score(y_test,y_pred)
precision=precision_score(y_test,y_pred,average='weighted')

In [211]:
print('accuracy score: ',accuracy)
print('precision score: ',precision)

accuracy score:  0.8360272347648301
precision score:  0.8260856806179719


In [212]:
model.score(X_test,y_test)

0.8360272347648301