In [30]:
import sys
import re
import nltk
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
from wordcloud import WordCloud,STOPWORDS
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem.porter import *
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import (KNeighborsClassifier,
                               NeighborhoodComponentsAnalysis)

[nltk_data] Downloading package stopwords to C:\Users\PRINCE
[nltk_data]     SINGH\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Loading the dataset

In [31]:
df = pd.read_csv('Twitter.csv')
df.head(20)

Unnamed: 0,id,label,tweet
0,1,1,@user when a father is disfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,1,@user that was fucking weird
3,4,1,@userthat was so shitty
4,5,0,factsguide: society now #motivation
5,6,0,[2/2] huge fan fare and big talking before the...
6,7,0,@user camping tomorrow @user @user @user @use...
7,8,0,the next school year is the year for exams. ca...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...
9,10,1,you are so boring


In [32]:
train_pos = df[df['label'] == 0]
train_neg = df[df['label'] == 1]

# Removal of all non-alphabetic letters

In [33]:
def clean_tweet_words(tweet):
    alpha_only = re.sub("[^a-zA-Z]",' ',tweet) #"[^a-zA-Z]" this regex will remove any non-alphabetical char as they are not significant
    words = alpha_only.lower().split()
    stop = set(stopwords.words('english'))
    #from the dataframe we can see 'user' word is quite common in the tweets, which is basically used for tagging someone in the tweet
    #so I will be removing that
    stop.add('user')
    sig_words = [word for word in words if not word in stop]
    return(" ".join(sig_words))

In [34]:
df['clean_tweet']  = df['tweet'].apply(lambda tweet: clean_tweet_words(tweet))

In [35]:
df.drop('tweet',axis=1,inplace=True)

In [36]:
df.head(10)

Unnamed: 0,id,label,clean_tweet
0,1,1,father disfunctional selfish drags kids disfun...
1,2,0,thanks lyft credit use cause offer wheelchair ...
2,3,1,fucking weird
3,4,1,userthat shitty
4,5,0,factsguide society motivation
5,6,0,huge fan fare big talking leave chaos pay disp...
6,7,0,camping tomorrow danny
7,8,0,next school year year exams think school exams...
8,9,0,love land allin cavs champions cleveland cleve...
9,10,1,boring


# TERM FREQUENCY

In [37]:
freq = pd.Series(' '.join(df['clean_tweet']).split()).value_counts()[:10]  ##couting common word
freq

love     2828
day      2393
amp      1777
happy    1707
u        1193
like     1180
life     1176
time     1149
today    1095
new      1003
dtype: int64

In [38]:
freq = list(freq.index)
df['clean_tweet'] = df['clean_tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in freq)) ##common word removal
#df['clean_tweet'].head()

In [39]:
freq = pd.Series(' '.join(df['clean_tweet']).split()).value_counts()[:10]  ##couting common word
freq

thankful    952
positive    937
get         923
people      885
good        874
bihday      873
one         798
see         764
smile       747
go          668
dtype: int64

In [40]:
freq = pd.Series(' '.join(df['clean_tweet']).split()).value_counts()[-10:] ##finding rare words
#freq 

# REMOVING RARE WORDS

In [41]:
freq = list(freq.index)
df['clean_tweet'] = df['clean_tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))  ## rare words removal
df['clean_tweet'].head()

0    father disfunctional selfish drags kids disfun...
1    thanks lyft credit use cause offer wheelchair ...
2                                        fucking weird
3                                      userthat shitty
4                        factsguide society motivation
Name: clean_tweet, dtype: object

# TOKENIZATION

In [42]:
tokenized_tweets=df['clean_tweet'].apply(lambda x:x.split())
tokenized_tweets.head()

0    [father, disfunctional, selfish, drags, kids, ...
1    [thanks, lyft, credit, use, cause, offer, whee...
2                                     [fucking, weird]
3                                   [userthat, shitty]
4                    [factsguide, society, motivation]
Name: clean_tweet, dtype: object

In [43]:
stemmer=PorterStemmer()


# STEMMING 

In [44]:
tokenized_tweets=tokenized_tweets.apply(lambda x:[stemmer.stem(i) for i in x])

In [45]:
tokenized_tweets.head()

0    [father, disfunct, selfish, drag, kid, disfunc...
1    [thank, lyft, credit, use, caus, offer, wheelc...
2                                        [fuck, weird]
3                                   [userthat, shitti]
4                          [factsguid, societi, motiv]
Name: clean_tweet, dtype: object

In [46]:
for i in range(len(tokenized_tweets)):
    tokenized_tweets[i]=' '.join(tokenized_tweets[i])
    
df['clean_tweet']=tokenized_tweets
#df.head(20)

In [47]:
Y=df['label']
X=df['clean_tweet']

# SPLITTING OF DATA

In [48]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.5,random_state=0)
X_train.shape,X_test.shape,Y_train.shape,Y_test.shape

((15981,), (15981,), (15981,), (15981,))

# ALGORITHM IMPLEMENTATION


In [49]:
svc_pipe = Pipeline([('tfidf',TfidfVectorizer()),('svc', LinearSVC(random_state=0,max_iter=5000))])
nb_pipe = Pipeline([('tfidf',TfidfVectorizer()),('nb', MultinomialNB())])
k_n_n_=svc_pipe

In [50]:
svc_pipe.fit(X_train,Y_train)
pred_svc = svc_pipe.predict(X_test)

In [51]:
nb_pipe.fit(X_train,Y_train)
pred_nb = nb_pipe.predict(X_test)

In [52]:
print('SVC')
print(accuracy_score(Y_train,pred_svc))
print('\n')
print(confusion_matrix(Y_train,pred_svc))
print('\n')

SVC
0.8902446655403291


[[14180   649]
 [ 1105    47]]




In [53]:
print('Naive Bayes Classifier')
print(accuracy_score(Y_train,pred_nb))
print('\n')
print(confusion_matrix(Y_train,pred_nb))
print('\n')

Naive Bayes Classifier
0.9210937988861774


[[14711   118]
 [ 1143     9]]




In [54]:
pred_nb = svc_pipe.predict(X_test)

In [55]:
print('SVC')
print(accuracy_score(Y_test,pred_svc))
print('\n')
print(confusion_matrix(Y_test,pred_svc))
print('\n')

SVC
0.9620173956573431


[[14781   103]
 [  504   593]]




In [56]:
pred_nb = nb_pipe.predict(X_test)
print('Naive Bayes Classifier')
print(accuracy_score(Y_test,pred_nb))
print('\n')
print(confusion_matrix(Y_test,pred_nb))
print('\n')

Naive Bayes Classifier
0.9391777736061573


[[14883     1]
 [  971   126]]




In [57]:
from sklearn.neighbors import KNeighborsClassifier
k_n_n= KNeighborsClassifier(n_neighbors=2)

In [58]:
k_n_n.fit(X_train,Y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('svc',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
               

In [59]:
pred_knn = k_n_n.predict(X_test)

In [63]:
print('KNN')
print(accuracy_score(Y_train,pred_knn))
print('\n')
print('Confusion Matrix')
print(confusion_matrix(Y_train,pred_knn))

KNN
0.8902446655403291


Confusion Matrix
[[14180   649]
 [ 1105    47]]
