In [1]:
from google.colab import files
uploaded = files.upload()

Saving train.txt to train.txt


# Data Cleaning

In [2]:
import pandas as pd
import numpy as np

In [3]:
file=pd.read_csv('train.txt', sep="\t") 

In [4]:
file.head()

Unnamed: 0,id,text,is_humor,humor_rating,humor_controversy,offense_rating
0,1,TENNESSEE: We're the best state. Nobody even c...,1,2.42,1.0,0.2
1,2,A man inserted an advertisement in the classif...,1,2.5,1.0,1.1
2,3,How many men does it take to open a can of bee...,1,1.95,0.0,2.4
3,4,Told my mom I hit 1200 Twitter followers. She ...,1,2.11,1.0,0.0
4,5,Roses are dead. Love is fake. Weddings are bas...,1,2.78,0.0,0.1


In [5]:
file=file.drop(columns='humor_rating')
file=file.drop(columns='humor_controversy')
file=file.drop(columns='offense_rating')
file=file.drop(columns='id')
print('File length',len(file))

File length 8000


In [6]:
from sklearn.preprocessing import LabelEncoder
file['labels']=LabelEncoder().fit_transform(file['is_humor'])
file=file.drop(columns='is_humor')
file

Unnamed: 0,text,labels
0,TENNESSEE: We're the best state. Nobody even c...,1
1,A man inserted an advertisement in the classif...,1
2,How many men does it take to open a can of bee...,1
3,Told my mom I hit 1200 Twitter followers. She ...,1
4,Roses are dead. Love is fake. Weddings are bas...,1
...,...,...
7995,Lack of awareness of the pervasiveness of raci...,0
7996,Why are aspirins white? Because they work sorry,1
7997,"Today, we Americans celebrate our independence...",1
7998,How to keep the flies off the bride at an Ital...,1


In [7]:
import collections
counter=collections.Counter(file['labels'])
print(counter)
#file['labels'].value_counts().plot(kind='bar')

Counter({1: 4932, 0: 3068})


# Data transformation

In [8]:
file['text'] = file['text'].values.tolist()
print(file['text'])

0       TENNESSEE: We're the best state. Nobody even c...
1       A man inserted an advertisement in the classif...
2       How many men does it take to open a can of bee...
3       Told my mom I hit 1200 Twitter followers. She ...
4       Roses are dead. Love is fake. Weddings are bas...
                              ...                        
7995    Lack of awareness of the pervasiveness of raci...
7996      Why are aspirins white? Because they work sorry
7997    Today, we Americans celebrate our independence...
7998    How to keep the flies off the bride at an Ital...
7999    "Each ounce of sunflower seeds gives you 37% o...
Name: text, Length: 8000, dtype: object


In [9]:
file['labels'] = file['labels'].values.tolist()
y=file['labels']


# Tokenization (p 183)

In [10]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(file['text'])
sequences = tokenizer.texts_to_sequences(file['text'])       #[[278, 1, 136, 560, 677, 148, 272, 561, 449, 91, 1, 346, 259, 408], [2, 80, 43, 8, 1, 55, 242, 1,
one_hot_encoded = tokenizer.texts_to_matrix(file['text'], mode='binary') # array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,  (1k values)
word_index = tokenizer.word_index                            #{'the': 1, 'a': 2, 'to': 3, 'i': 4, 'you': 5, 'and': 6, 'of': 7,

#Split train test

In [11]:
from sklearn.model_selection import train_test_split

xtrain, xremaining, ytrain, yremaining = train_test_split(one_hot_encoded,y, train_size=0.6)
xval, xtest, yval, ytest = train_test_split(xremaining, yremaining, test_size=0.5)
print('xtrain',len(xtrain), 'ytrain',len(ytrain),'xval', len(xval),'yval', len(yval),'xtest', len(xtest),'ytest', len(ytest))

xtrain 4800 ytrain 4800 xval 1600 yval 1600 xtest 1600 ytest 1600


- xtrain..ytrain 4800 4800
- xval.....yval...1600 1600
- xtest...ytest..1600 1600 

In [12]:
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier(criterion='gini', 
                                       n_estimators=700, 
                                       min_samples_split=10,     
                                       min_samples_leaf=1,     
                                       max_features='auto',     
                                       oob_score=True,     
                                       random_state=1,    
                                       n_jobs=-1) 


In [13]:
random_forest.fit(xtrain, ytrain)

RandomForestClassifier(min_samples_split=10, n_estimators=700, n_jobs=-1,
                       oob_score=True, random_state=1)

In [14]:
#ypred = random_forest.predict(xtest) 

In [15]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
 
yhat_probs = random_forest.predict(xtest) 

# predict crisp classes for test set
yhat_classes = random_forest.predict(xtest)
yhat_classes = np.where(yhat_classes > 0.5, 1, 0)
print(yhat_classes)
 
# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(ytest, yhat_classes)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(ytest, yhat_classes)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(ytest, yhat_classes)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(ytest, yhat_classes)
print('F1 score: %f' % f1)
 
# confusion matrix
matrix = confusion_matrix(ytest, yhat_classes)
print(matrix)

[0 1 0 ... 1 1 1]
Accuracy: 0.817500
Precision: 0.813076
Recall: 0.908436
F1 score: 0.858115
[[425 203]
 [ 89 883]]
