# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report



from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re

# Loading Dataset

In [2]:
data=pd.read_csv("C:\\Users\\lenovo\\Downloads\\train.csv", encoding = 'latin-1')

In [3]:
data.head(10)

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation
5,6,0,[2/2] huge fan fare and big talking before the...
6,7,0,@user camping tomorrow @user @user @user @use...
7,8,0,the next school year is the year for exams.Ã°Â...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...
9,10,0,@user @user welcome here ! i'm it's so #gr...


In [4]:
data.describe()

Unnamed: 0,id,label
count,31962.0,31962.0
mean,15981.5,0.070146
std,9226.778988,0.255397
min,1.0,0.0
25%,7991.25,0.0
50%,15981.5,0.0
75%,23971.75,0.0
max,31962.0,1.0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 749.2+ KB


In [6]:
data=data.drop(['id'],axis=1)

In [7]:
data['Count']=0
for i in np.arange(0,len(data.tweet)):
    data.loc[i,'Count'] = len(data.loc[i,'tweet'])

In [8]:
data.head(10)

Unnamed: 0,label,tweet,Count
0,0,@user when a father is dysfunctional and is s...,102
1,0,@user @user thanks for #lyft credit i can't us...,122
2,0,bihday your majesty,21
3,0,#model i love u take with u all the time in ...,118
4,0,factsguide: society now #motivation,39
5,0,[2/2] huge fan fare and big talking before the...,116
6,0,@user camping tomorrow @user @user @user @use...,77
7,0,the next school year is the year for exams.Ã°Â...,151
8,0,we won!!! love the land!!! #allin #cavs #champ...,90
9,0,@user @user welcome here ! i'm it's so #gr...,50


# Preparing WordVector Corpus

In [9]:
corpus = []

#  Using Porter Stemmer

In [10]:
ps=PorterStemmer()

In [11]:
for i in range(0, 5572):
    # Applying Regular Expression
    
    '''
    Replace email addresses with 'emailaddr'
    Replace URLs with 'httpaddr'
    Replace money symbols with 'moneysymb'
    Replace phone numbers with 'phonenumbr'
    Replace numbers with 'numbr'
    '''
    msg = data['tweet'][i]
    msg = re.sub('\b[\w\-.]+?@\w+?\.\w{2,4}\b', 'emailaddr', data['tweet'][i])
    msg = re.sub('(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)', 'httpaddr', data['tweet'][i])
    msg = re.sub('£|\$', 'moneysymb', data['tweet'][i])
    msg = re.sub('\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b', 'phonenumbr', data['tweet'][i])
    msg = re.sub('\d+(\.\d+)?', 'numbr', data['tweet'][i])
    
    ''' Remove all punctuations '''
    
    msg = re.sub('[^\w\d\s]', ' ', data['tweet'][i])
    
    if i<2:
        print("\t\t\t\t Tweet ", i)
    
    if i<2:
        print("\n After Regular Expression - tweet ", i, " : ", msg)
        
    # Each word to lower case
    msg = msg.lower()    
    if i<2:
        print("\n Lower case Tweet ", i, " : ", msg)
     # Splitting words to Tokenize   
    msg = msg.split()    
    if i<2:
        print("\n After Splitting - Tweet ", i, " : ", msg)
        
    # Stemming with PorterStemmer handling Stop Words
    msg = [ps.stem(word) for word in msg if not word in set(stopwords.words('english'))]
    if i<2:
        print("\n After Stemming - Tweet ", i, " : ", msg)
    # preparing Messages with Remaining Tokens
    msg = ' '.join(msg)
    if i<2:
        print("\n Final Prepared - Tweet ", i, " : ", msg, "\n\n")
     # Preparing WordVector Corpus   
    corpus.append(msg)

				 Tweet  0

 After Regular Expression - tweet  0  :    user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction     run

 Lower case Tweet  0  :    user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction     run

 After Splitting - Tweet  0  :  ['user', 'when', 'a', 'father', 'is', 'dysfunctional', 'and', 'is', 'so', 'selfish', 'he', 'drags', 'his', 'kids', 'into', 'his', 'dysfunction', 'run']

 After Stemming - Tweet  0  :  ['user', 'father', 'dysfunct', 'selfish', 'drag', 'kid', 'dysfunct', 'run']

 Final Prepared - Tweet  0  :  user father dysfunct selfish drag kid dysfunct run 


				 Tweet  1

 After Regular Expression - tweet  1  :   user  user thanks for  lyft credit i can t use cause they don t offer wheelchair vans in pdx      disapointed  getthanked

 Lower case Tweet  1  :   user  user thanks for  lyft credit i can t use cause they don t offer wheelchair vans in pdx      disapointed  getthanked

 Aft

# Preparing Vectors for each message

In [12]:
cv = CountVectorizer()
data_input = cv.fit_transform(corpus).toarray()

# Applying Classification 


In [13]:
data_output = data.iloc[0:5572,0:1]
print (data_output.value_counts())

label
0        5187
1         385
dtype: int64


# Splitting data for Training and Testing

In [14]:
train_x, test_x, train_y, test_y = train_test_split(data_input, data_output,test_size= 0.20, random_state = 0)

# Preparing ML Models

In [15]:
le = LabelEncoder()
data_output = le.fit_transform(data_output)

print(data_output[0])
print(data_output[1])

0
0


  return f(**kwargs)


# Training

In [16]:
bayes_classifier = GaussianNB()
bayes_classifier.fit(train_x, train_y)

  return f(**kwargs)


GaussianNB()

# Prediction

In [17]:
pred_y = bayes_classifier.predict(test_x)

In [18]:
cm = confusion_matrix(test_y, pred_y)
cm

array([[973,  58],
       [ 53,  31]], dtype=int64)

In [19]:
print ("Accuracy : %0.5f \n\n" % accuracy_score(test_y, bayes_classifier.predict(test_x)))
print (classification_report(test_y, bayes_classifier.predict(test_x)))

Accuracy : 0.90045 


              precision    recall  f1-score   support

           0       0.95      0.94      0.95      1031
           1       0.35      0.37      0.36        84

    accuracy                           0.90      1115
   macro avg       0.65      0.66      0.65      1115
weighted avg       0.90      0.90      0.90      1115

