In [43]:
# import the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [44]:
# import libraries for nlp
import nltk
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\RAHUL\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [45]:
# REAd the dataset
data = pd.read_csv('dataset//train_E6oV3lV.csv')

In [46]:
data.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [47]:
data.shape

(31962, 3)

In [48]:
# prerprocessing function
def pre_process(tweet):
    '''
    tweet: tweet is a long string (i.e individual tweet)
    final_list = return list of words
    '''
    
    # lowercase
    tweet = tweet.lower()
    
    # remove punctuation
    tweet = "".join([char for char in tweet if char not in string.punctuation])
    
    # tokenization
    tweet_l = word_tokenize(tweet)
    
    # remove stop words
    stop_words = stopwords.words('english')
    tweet_l = [word for word in tweet_l if word not in stop_words]
    
    # stemming
    porter = PorterStemmer()
    final_list = [porter.stem(word) for word in tweet_l]
    
    return final_list

In [49]:
# Testing the above function
t = "This is me! Hey, It's rahul."
res = pre_process(t)
print(res)

['hey', 'rahul']


In [50]:
# preprocess all corpora
X = []
for tweet in data['tweet']:
    tweet = pre_process(tweet)
    X.append(tweet)

In [51]:
Y = data['label']

In [52]:
len(X)

31962

In [53]:
x = np.array(X)

In [54]:
vocab_l = []
for lst in x:
    vocab_l += lst
vocab_l

['user',
 'father',
 'dysfunct',
 'selfish',
 'drag',
 'kid',
 'dysfunct',
 'run',
 'user',
 'user',
 'thank',
 'lyft',
 'credit',
 'cant',
 'use',
 'caus',
 'dont',
 'offer',
 'wheelchair',
 'van',
 'pdx',
 'disapoint',
 'getthank',
 'bihday',
 'majesti',
 'model',
 'love',
 'u',
 'take',
 'u',
 'time',
 'urð\x9f\x93±',
 'ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91',
 'ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦',
 'factsguid',
 'societi',
 'motiv',
 '22',
 'huge',
 'fan',
 'fare',
 'big',
 'talk',
 'leav',
 'chao',
 'pay',
 'disput',
 'get',
 'allshowandnogo',
 'user',
 'camp',
 'tomorrow',
 'user',
 'user',
 'user',
 'user',
 'user',
 'user',
 'user',
 'dannyâ\x80¦',
 'next',
 'school',
 'year',
 'year',
 'examsð\x9f\x98¯',
 'cant',
 'think',
 'ð\x9f\x98\xad',
 'school',
 'exam',
 'hate',
 'imagin',
 'actorslif',
 'revolutionschool',
 'girl',
 'love',
 'land',
 'allin',
 'cav',
 'champion',
 'cleveland',
 'clevelandcavali',
 'â\x80¦',
 'user',
 'user',
 'welcom',
 'im',
 'gr8',
 'â\x86\x9d'

In [55]:
len(vocab_l)

281668

In [56]:
vocab = set(vocab_l)
len(vocab)

41367

In [57]:
empty_X = np.zeros([len(X), len(vocab)], dtype = 'int8')
df = pd.DataFrame(empty_X, columns=list(vocab))
df.shape

(31962, 41367)

In [58]:
def get_freqs(lst):
    '''
    lst: list type (each processed tweets)
    
    return
    dict: {word:freq}
    '''
    dict = {}
    for word in lst:
        if word in dict.keys():
            dict[word] += 1
        else:
            dict[word] = 1
            
    return dict    

In [59]:
# test above function implementation
print(get_freqs(pre_process(t)))

{'hey': 1, 'rahul': 1}


In [60]:
df.head()

Unnamed: 0,gunner,yourslef,jokeâ¦,latenighttv,hotti,follow4followâ¦,instapicinstagoodinstafashionâ¦,summerbodi,872,taeilthailand,...,jel,askgat,nutti,weakdonaldtrump,willamett,mybihdaybyclassm,hbdtahirgond,ð¯ð,steveâ,nonton
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [61]:
df.at[0, 'hotti']

0

In [62]:
for i in range(df.shape[0]):
    print("Processing {0} tweet".format(i))
    for l in X:
        temp_dict = get_freqs(l)
        for key in temp_dict:
            df.at[i,key] = temp_dict[key]
            
print("\nAll Done!")

Processing 0 tweet
Processing 1 tweet
Processing 2 tweet
Processing 3 tweet
Processing 4 tweet
Processing 5 tweet
Processing 6 tweet
Processing 7 tweet
Processing 8 tweet
Processing 9 tweet
Processing 10 tweet
Processing 11 tweet
Processing 12 tweet
Processing 13 tweet
Processing 14 tweet
Processing 15 tweet
Processing 16 tweet
Processing 17 tweet
Processing 18 tweet
Processing 19 tweet
Processing 20 tweet
Processing 21 tweet
Processing 22 tweet
Processing 23 tweet
Processing 24 tweet
Processing 25 tweet
Processing 26 tweet
Processing 27 tweet
Processing 28 tweet
Processing 29 tweet
Processing 30 tweet
Processing 31 tweet
Processing 32 tweet
Processing 33 tweet
Processing 34 tweet
Processing 35 tweet
Processing 36 tweet
Processing 37 tweet
Processing 38 tweet
Processing 39 tweet
Processing 40 tweet
Processing 41 tweet
Processing 42 tweet
Processing 43 tweet
Processing 44 tweet
Processing 45 tweet
Processing 46 tweet
Processing 47 tweet
Processing 48 tweet
Processing 49 tweet
Processing

KeyboardInterrupt: 

In [14]:
# split the data into train and test
from sklearn.model_selection import train_test_split
X_train, Y_train, X_test, Y_test = train_test_split(X, Y, test_size = 0.35, random_state = 143)