In [24]:
import pandas as pd
from wordcloud import WordCloud
import re
from wordcloud import STOPWORDS
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [2]:
# Colab-only step: mount Google Drive at /content/drive so the dataset CSV
# stored under MyDrive can be read by the loading cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# Load the tweet dataset from the mounted Drive and preview the first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column next to id/label/tweet —
# presumably an index saved by an earlier export; confirm before relying on it.
dataset = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/final_dataset_basicmlmodel.csv')
dataset.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [5]:
# Print five raw tweets (slice 10:15), numbered from 1
for position, tweet in enumerate(dataset["tweet"][10:15], start=1):
    print(position, ".", tweet)

1 .  â #ireland consumer price index (mom) climbed from previous 0.2% to 0.5% in may   #blog #silver #gold #forex
2 . we are so selfish. #orlando #standwithorlando #pulseshooting #orlandoshooting #biggerproblems #selfish #heabreaking   #values #love #
3 . i get to see my daddy today!!   #80days #gettingfed
4 . ouch...junior is angryð#got7 #junior #yugyoem   #omg 
5 . i am thankful for having a paner. #thankful #positive     


In [9]:
def clean_text(text):
    """Normalise a tweet to lowercase ASCII letters, apostrophes and spaces.

    Every character other than ``a-z``, ``A-Z`` or ``'`` (digits, punctuation,
    ``@``/``#`` markers, emoji, mis-encoded bytes, other whitespace) is replaced
    by one space, character for character, so the result has the same length
    as the input and runs of spaces are not collapsed.

    Args:
        text: The raw tweet as a ``str``.

    Returns:
        The cleaned, lower-cased string.
    """
    #Filter to allow only alphabets and apostrophes (apostrophes are kept so
    #contractions such as "can't" survive). Non-ASCII characters fall outside
    #this whitelist as well, so no separate Unicode-stripping pass is needed.
    text = re.sub(r'[^a-zA-Z\']', ' ', text)

    #Convert to lowercase to maintain consistency
    return text.lower()

In [11]:
# Store a cleaned version of every tweet next to the raw text, then show the frame
dataset['clean_text'] = dataset['tweet'].apply(clean_text)
dataset

Unnamed: 0,id,label,tweet,clean_text
0,1,0,@user when a father is dysfunctional and is s...,user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...,user user thanks for lyft credit i can't us...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation,factsguide society now motivation
...,...,...,...,...
5237,31935,1,lady banned from kentucky mall. @user #jcpenn...,lady banned from kentucky mall user jcpenn...
5238,31947,1,@user omfg i'm offended! i'm a mailbox and i'...,user omfg i'm offended i'm a mailbox and i'...
5239,31948,1,@user @user you don't have the balls to hashta...,user user you don't have the balls to hashta...
5240,31949,1,"makes you ask yourself, who am i? then am i a...",makes you ask yourself who am i then am i a...


In [13]:
# Inspect the stopword set imported from the wordcloud package
print(STOPWORDS)

{'further', 'they', 'were', 'when', 'did', 'up', 'how', 'by', 'there', 'over', 'ours', 'its', 'and', 'most', 'having', 'more', 'cannot', "couldn't", 'not', 'own', 'would', 'ever', "you're", "there's", 'themselves', "i'll", 'like', 'he', "they've", "you've", 'both', 'than', 'being', 'some', "how's", 'only', 'as', "can't", 'however', 'your', "what's", 'on', "i'd", 'here', "hasn't", "mustn't", "we've", 'each', 'before', "you'd", 'is', "you'll", "she's", 'yourselves', 'same', "he'll", 'does', 'if', 'down', 'where', "she'd", 'was', "they're", 'that', 'can', 'no', 'after', 'you', 'below', "don't", 'get', "hadn't", "we're", 'during', "won't", "here's", 'into', "shan't", 'has', 'through', 'theirs', 'few', 'hence', 'himself', 'what', "aren't", "wasn't", "we'd", 'against', 'also', 'we', "weren't", 'should', 'she', 'his', 'this', "where's", 'our', 'therefore', 'her', 'to', "isn't", "when's", 'with', "they'd", "haven't", 'about', "he'd", 'it', 'be', 'such', 'why', 'yours', 'who', 'http', 'very', '

In [14]:
# Alias wordcloud's stopword set (same object, not a copy) under the name gen_freq() reads
STOP_WORDS = STOPWORDS

In [15]:
def gen_freq(text):
    """Count word occurrences across tweets, leaving out stopwords.

    Args:
        text: Object whose ``split()`` yields one iterable of words per tweet;
            the notebook passes the ``.str`` accessor of the ``clean_text``
            column.

    Returns:
        A ``pd.Series`` of counts indexed by word, in ``value_counts()`` order,
        with every label found in ``STOP_WORDS`` removed.
    """
    #Flatten the per-tweet word lists into a single list of words
    all_words = [word for tweet_words in text.split() for word in tweet_words]

    #Tally occurrences of each word
    counts = pd.Series(all_words).value_counts()

    #Discard stopwords; labels that do not occur in the counts are ignored
    return counts.drop(STOP_WORDS, errors='ignore')
 
#Check whether a negation term is present in the text
def any_neg(words):
    """Flag whether a tokenised text contains a negation term.

    Args:
        words: Iterable of word strings.

    Returns:
        1 if any word is exactly 'n', 'no', 'non' or 'not', or contains a word
        character followed by "n't" (e.g. "don't"); otherwise 0.
    """
    negations = ['n', 'no', 'non', 'not']
    found = any(
        token in negations or re.search(r"\wn't", token) for token in words
    )
    return 1 if found else 0
 
#Check whether one of the 100 rare words is present in the text
def any_rare(words, rare_100):
    """Flag whether a tokenised text contains at least one rare word.

    Args:
        words: Iterable of word strings.
        rare_100: Container tested with ``in``. The notebook passes a pandas
            Series, for which ``in`` checks the index labels (the words),
            not the stored counts.

    Returns:
        1 if any word is found in ``rare_100``, otherwise 0.
    """
    return 1 if any(token in rare_100 for token in words) else 0
 
#Check whether prompt words are present
def is_question(words):
    """Flag whether a tokenised text contains a question prompt word.

    Args:
        words: Iterable of word strings.

    Returns:
        1 if any word is exactly 'when', 'what', 'how', 'why' or 'who'
        (case-sensitive comparison), otherwise 0.
    """
    prompts = ['when', 'what', 'how', 'why', 'who']
    return int(any(token in prompts for token in words))

In [17]:
#Word frequencies over all cleaned tweets (stopwords removed by gen_freq)
word_freq = gen_freq(dataset.clean_text.str)
#100 most rare words in the dataset: the last 100 entries of the frequency table.
#Kept as a Series; any_rare() tests membership against its index (the words).
#NOTE(review): this is derived from the whole dataset before the train/test
#split is made — confirm that is intended.
rare_100 = word_freq.iloc[-100:]

#Tokenise every cleaned tweet once and reuse the result for all word-level features
tweet_tokens = dataset.clean_text.str.split()

#Number of words in a tweet
dataset['word_count'] = tweet_tokens.apply(len)
#Negation present or not
dataset['any_neg'] = tweet_tokens.apply(any_neg)
#Prompt present or not
dataset['is_question'] = tweet_tokens.apply(is_question)
#Any of the most 100 rare words present or not
dataset['any_rare'] = tweet_tokens.apply(lambda words: any_rare(words, rare_100))
#Character count of the tweet
dataset['char_count'] = dataset.clean_text.apply(len)
dataset

Unnamed: 0,id,label,tweet,clean_text,word_count,any_neg,is_question,any_rare,char_count
0,1,0,@user when a father is dysfunctional and is s...,user when a father is dysfunctional and is s...,18,0,1,0,102
1,2,0,@user @user thanks for #lyft credit i can't us...,user user thanks for lyft credit i can't us...,19,1,0,0,122
2,3,0,bihday your majesty,bihday your majesty,3,0,0,0,21
3,4,0,#model i love u take with u all the time in ...,model i love u take with u all the time in ...,12,0,0,0,86
4,5,0,factsguide: society now #motivation,factsguide society now motivation,4,0,0,0,39
...,...,...,...,...,...,...,...,...,...
5237,31935,1,lady banned from kentucky mall. @user #jcpenn...,lady banned from kentucky mall user jcpenn...,8,0,0,0,59
5238,31947,1,@user omfg i'm offended! i'm a mailbox and i'...,user omfg i'm offended i'm a mailbox and i'...,12,0,0,0,82
5239,31948,1,@user @user you don't have the balls to hashta...,user user you don't have the balls to hashta...,23,1,0,1,112
5240,31949,1,"makes you ask yourself, who am i? then am i a...",makes you ask yourself who am i then am i a...,17,0,1,0,87


In [18]:
# First ten entries of the stopword-filtered frequency table
gen_freq(dataset.clean_text.str).head(10)

user      3351
amp        439
love       320
day        254
trump      214
happy      207
will       191
people     186
new        171
u          158
dtype: int64

In [20]:
# Target: the tweet label; features: the five hand-crafted columns
y = dataset['label']
X = dataset[['word_count', 'any_neg', 'any_rare', 'char_count', 'is_question']]

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=27,
)

In [23]:
# Fit a Gaussian Naive Bayes classifier on the training split
model = GaussianNB().fit(X_train, y_train)

# Predict labels for the held-out rows
pred = model.predict(X_test)

In [25]:
# Accuracy on the held-out rows, expressed as a percentage
print("Accuracy:", 100 * accuracy_score(y_test, pred), "%")

Accuracy: 44.0 %
