### Categorizing Tweets Using Natural Language Processing

In [21]:
import pandas as pd
import nltk 
import warnings
warnings.filterwarnings('ignore')

nltk.download('punkt')
nltk.download('wordnet')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer

import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vlekkala\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vlekkala\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
#!wget -O scocialmedia_relevant_cols.csv https://www.dropbox.com/s/rsdr3419xk9yean/socialmedia_relevant_cols.csv --no-check-certificate

In [7]:
df = pd.read_csv('socialmedia_relevant_cols.csv')
df.head()

Unnamed: 0,text,choose_one,class_label
0,Just happened a terrible car crash,Relevant,1
1,Our Deeds are the Reason of this #earthquake M...,Relevant,1
2,"Heard about #earthquake is different cities, s...",Relevant,1
3,"there is a forest fire at spot pond, geese are...",Relevant,1
4,Forest fire near La Ronge Sask. Canada,Relevant,1


In [8]:
df.shape

(10876, 3)

In [12]:
df.isna().sum()

text           0
choose_one     0
class_label    0
dtype: int64

In [14]:
df.class_label.value_counts()

0    6187
1    4673
2      16
Name: class_label, dtype: int64

In [18]:
import re
stop_words = stopwords.words('English')
lemmatizer = WordNetLemmatizer()

In [19]:
def clean_text(sentence):
    sentence = sentence.lower()
    words = word_tokenize(sentence)
    tokens = [lemmatizer.lemmatize(word) for word in words 
             if word not in stop_words 
              and word not in string.punctuation
             and re.match(r'^\w+$', word)]
    return " ".join(tokens)
    

In [20]:
df['cleaned_text'] = df['text'].apply(clean_text)
df.head()

Unnamed: 0,text,choose_one,class_label,cleaned_text
0,Just happened a terrible car crash,Relevant,1,happened terrible car crash
1,Our Deeds are the Reason of this #earthquake M...,Relevant,1,deed reason earthquake may allah forgive u
2,"Heard about #earthquake is different cities, s...",Relevant,1,heard earthquake different city stay safe ever...
3,"there is a forest fire at spot pond, geese are...",Relevant,1,forest fire spot pond goose fleeing across str...
4,Forest fire near La Ronge Sask. Canada,Relevant,1,forest fire near la ronge sask canada


In [23]:
tfidf = TfidfVectorizer(lowercase=True)
model = RandomForestClassifier()
X = df['cleaned_text']
y = df['class_label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = Pipeline([('tfidf', tfidf), ('model', model)])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [24]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.89      0.83      1216
           1       0.83      0.69      0.75       957
           2       0.00      0.00      0.00         3

    accuracy                           0.80      2176
   macro avg       0.54      0.53      0.53      2176
weighted avg       0.80      0.80      0.80      2176



In [25]:
# Question 5: Analyze text data
# Find the size of vocabulary of the text data

In [28]:
complete = " ".join(df['cleaned_text'])
words = complete.split()
from collections import Counter

count_vocab = Counter(words)
count_vocab.most_common(20)

[('http', 6774),
 ('fire', 513),
 ('amp', 510),
 ('like', 494),
 ('get', 379),
 ('new', 327),
 ('via', 323),
 ('one', 291),
 ('u', 290),
 ('news', 288),
 ('people', 280),
 ('2', 237),
 ('video', 235),
 ('time', 227),
 ('would', 226),
 ('emergency', 224),
 ('disaster', 222),
 ('year', 210),
 ('body', 198),
 ('police', 196)]

In [36]:
df['tokens'] = df.cleaned_text.str.split()
df.head()

Unnamed: 0,text,choose_one,class_label,cleaned_text,tokens
0,Just happened a terrible car crash,Relevant,1,happened terrible car crash,"[happened, terrible, car, crash]"
1,Our Deeds are the Reason of this #earthquake M...,Relevant,1,deed reason earthquake may allah forgive u,"[deed, reason, earthquake, may, allah, forgive..."
2,"Heard about #earthquake is different cities, s...",Relevant,1,heard earthquake different city stay safe ever...,"[heard, earthquake, different, city, stay, saf..."
3,"there is a forest fire at spot pond, geese are...",Relevant,1,forest fire spot pond goose fleeing across str...,"[forest, fire, spot, pond, goose, fleeing, acr..."
4,Forest fire near La Ronge Sask. Canada,Relevant,1,forest fire near la ronge sask canada,"[forest, fire, near, la, ronge, sask, canada]"


In [29]:
# Question-6: Create word embeddings

In [37]:
from gensim.models import Word2Vec
model_vec_train = Word2Vec(sentences=df['tokens'], size=200, window=5, min_count=5, workers=-1, sg=1)
model_vec_train.save('word2vec.model')

In [38]:
model_vec_train['like'].shape

(200,)

In [33]:
# Question-7: Generate features using word embeddings for training and testing set

In [34]:
import numpy as np
def get_embeddings(sent_token, model=model_vec_train):
    vector = [model[word] if word in model else np.zeros(200) for word in sent_token]
    l = len(vector)
    s = np.sum(vector, axis=0)
    avg = s/l
    return avg

In [39]:
def generate_embeddings(data, model=model_vec_train):
    embeddings = data.apply(lambda x:get_embeddings(x, model))
    return embeddings

In [40]:
X_train, X_test, y_train, y_test = train_test_split(df['tokens'], df['class_label'], test_size=0.2, random_state=42)

embeddings_train = generate_embeddings(X_train)
embeddings_test = generate_embeddings(X_test)

In [41]:
# Converting the output word embeddings as dataframe

d = dict()
for i in range(200):
    l = []
    for j in range(8700):
        try:
            l.append(embeddings_train.values[j][i])
        except:
            l.append(0)
    d[i] = l
train = pd.DataFrame(d)

In [42]:
dic = dict()
for i in range(200):
    l = []
    for j in range(2176):
        try:
            l.append(embeddings_test.values[j][i])
        except:
            l.append(0)
    dic[i] = l
    
test = pd.DataFrame(dic)

In [43]:
train.shape

(8700, 200)

In [44]:
train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,-4.4e-05,0.000258,-0.000194,0.000438,0.000204,-0.000422,-0.000815,-0.000115,-0.000111,0.000314,...,0.000266,-0.000382,0.00015,0.000373,-0.000294,-0.000333,-0.000135,-0.000685,-0.000768,-0.000515
1,-5e-06,-0.000403,-7.1e-05,-0.000194,-0.000506,-0.000133,0.000197,0.000403,-4.7e-05,-0.000197,...,3e-05,-0.000418,-0.000172,0.000149,5e-05,-0.000334,0.000467,4.6e-05,-0.000393,-0.00026
2,0.000109,-0.000515,0.001282,0.000134,-0.000752,-0.001054,7.4e-05,-0.000614,0.000186,0.000203,...,8.5e-05,0.000609,-0.00029,-0.000635,-0.000786,-0.000548,0.000195,-0.000426,-0.000311,0.000693
3,-1.2e-05,6.4e-05,0.000403,-0.000411,-0.000518,-0.000534,0.000295,-0.000479,-0.000128,-0.000274,...,1.9e-05,0.000381,-0.000663,-0.000933,0.000528,-0.000297,-0.000523,-3.8e-05,0.001099,0.000435
4,-0.000258,-0.000652,0.000225,-0.001568,-0.000393,-0.000608,-0.000186,-0.00027,-0.000505,-0.000263,...,0.001069,0.000285,0.001194,-0.000225,-0.000287,-0.000284,0.000257,-0.000937,0.00093,0.000805


In [45]:
# Question-8: Perform K-Fold cross validation for model selection

In [46]:
from sklearn.model_selection import KFold
kfold = KFold(n_splits=10, random_state=7)
cross_val_sc = cross_val_score(model, train, y_train, scoring='accuracy', cv=kfold)
print('acc: {}(standard deviation: {}'.format(cross_val_sc.mean(), cross_val_sc.std()))

acc: 0.7108045977011495(standard deviation: 0.021455801860537926


In [48]:
model = RandomForestClassifier()
model.fit(train, y_train)
model.score(train, y_train)

0.9844827586206897