In [2]:
#importing the necessary libraries
#tweet-preprocessor is a library that can be used for cleaning tweet texts
#the regular expression library (re) deals with the special cases which the tweet preprocessor doesn't handle
import sklearn 
import numpy as np
import pandas as pd
!pip install tweet-preprocessor
import preprocessor as p
import re



Collecting tweet-preprocessor
  Downloading https://files.pythonhosted.org/packages/17/9d/71bd016a9edcef8860c607e531f30bd09b13103c7951ae73dd2bf174163c/tweet_preprocessor-0.6.0-py3-none-any.whl
Installing collected packages: tweet-preprocessor
Successfully installed tweet-preprocessor-0.6.0


In [3]:
#loading the dataset...the goal is to detect whether a given piece of text is humorous or not...in the dataset,
#is_humor=1 means that the text is humorous and is_humor=0 means that the text is non-humorous
train=pd.read_csv('train.csv')
#preview the first rows to check which columns were loaded
train.head()

Unnamed: 0,id,text,is_humor,humor_rating,humor_controversy,offense_rating
0,1,TENNESSEE: We're the best state. Nobody even c...,1,2.42,1.0,0.2
1,2,A man inserted an advertisement in the classif...,1,2.5,1.0,1.1
2,3,How many men does it take to open a can of bee...,1,1.95,0.0,2.4
3,4,Told my mom I hit 1200 Twitter followers. She ...,1,2.11,1.0,0.0
4,5,Roses are dead. Love is fake. Weddings are bas...,1,2.78,0.0,0.1


In [5]:
#These are the special cases which the tweet preprocessor doesn't handle.
#Raw strings are used so that regex escapes such as \. and \s reach the re module unchanged;
#in a normal string literal they are invalid escape sequences, which newer Python versions warn about.
#REPLACE_NO_SPACE matches the punctuation characters that are deleted outright.
REPLACE_NO_SPACE=re.compile(r"(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\|)|(\()|(\))|(\[)|(\])|(\%)|(\$)|(\>)|(\<)|(\{)|(\})")
#REPLACE_WITH_SPACE matches the separators that are replaced by a single space.
#NOTE(review): the <br> alternative requires exactly one whitespace character and has no closing ">",
#which looks like a typo for "<br\s*/><br\s*/>" - kept as written to preserve behaviour; confirm the intent.
REPLACE_WITH_SPACE=re.compile(r"(<br\s/><br\s/?)|(-)|(/)|(:)")

In [6]:
#the clean_humor function is used to clean the dataset
def clean_humor(df):
  """Return a list holding the cleaned version of every text in ``df``.

  ``df`` is iterated item by item. Each text is passed through the tweet
  preprocessor, lower-cased, stripped of every REPLACE_NO_SPACE match and
  finally has every REPLACE_WITH_SPACE match replaced by a single space.
  """
  def _clean_one(text):
    cleaned=p.clean(text)#tweet preprocessor pass
    without_punctuation=REPLACE_NO_SPACE.sub("",cleaned.lower())#delete punctuation
    return REPLACE_WITH_SPACE.sub(" ",without_punctuation)#separators become spaces
  return [_clean_one(text) for text in df]

In [7]:
#Pass the texts from the training data through clean_humor and wrap the
#resulting list of cleaned strings in a single-column dataframe
train_humor=pd.DataFrame(clean_humor(train["text"]))

In [9]:
#appending the cleaned texts to the original dataset as a new "clean_humor" column
#train_humor is a single-column dataframe built from the cleaned texts
#NOTE(review): the assignment presumably lines up row by row because both frames use the default index - confirm
train["clean_humor"]=train_humor
#preview the first rows to check the new column
train.head()

Unnamed: 0,id,text,is_humor,humor_rating,humor_controversy,offense_rating,clean_humor
0,1,TENNESSEE: We're the best state. Nobody even c...,1,2.42,1.0,0.2,tennessee were the best state nobody even come...
1,2,A man inserted an advertisement in the classif...,1,2.5,1.0,1.1,a man inserted an advertisement in the classif...
2,3,How many men does it take to open a can of bee...,1,1.95,0.0,2.4,how many men does it take to open a can of bee...
3,4,Told my mom I hit 1200 Twitter followers. She ...,1,2.11,1.0,0.0,told my mom i hit twitter followers she pointe...
4,5,Roses are dead. Love is fake. Weddings are bas...,1,2.78,0.0,0.1,roses are dead love is fake weddings are basic...


In [10]:
#Split the cleaned texts and their labels into 70% training data and 30% testing data
from sklearn.model_selection import train_test_split
y=train.is_humor.values
#stratify=y keeps the humorous/non-humorous ratio the same in both splits;
#random_state=1 makes the shuffled split reproducible
x_train,x_test,y_train,y_test=train_test_split(
    train.clean_humor.values,
    y,
    test_size=0.3,
    shuffle=True,
    stratify=y,
    random_state=1,
)

In [11]:
#We need to convert the text to numeric form as the model won't understand human language,so we'll vectorize the texts
#using CountVectorizer...CountVectorizer provides a simple way of tokenizing a collection of text documents
#The stop_words argument has been used to remove stop words like "a","the"
#binary=True records only whether a word occurs in a text, not how many times it occurs
from sklearn.feature_extraction.text import CountVectorizer
vectorizer=CountVectorizer(binary=True,stop_words='english')
#Learn the vocabulary from the training texts only, so that nothing from the held-out test set
#leaks into the features; words that occur only in the test texts are ignored by transform
x_train_vectorization=vectorizer.fit_transform(x_train)
x_test_vectorization=vectorizer.transform(x_test)

In [12]:
#We have used support vector classifier,it is a supervised learning algorithm used for binary or multiclass classification
#SVC is imported directly so that the name svm refers only to the classifier and does not rebind the sklearn.svm module name
from sklearn.svm import SVC
#probability=True enables predict_proba; random_state makes those probability estimates reproducible
svm=SVC(kernel='linear',probability=True,random_state=1)
svm.fit(x_train_vectorization,y_train)
prob=svm.predict_proba(x_test_vectorization)#class probabilities for the test texts
y_pred_svm=svm.predict(x_test_vectorization)#predicted labels for the test texts

In [14]:
#Evaluating the predictions on the test split: accuracy and F1 score, both printed as percentages
from sklearn.metrics import accuracy_score,f1_score
print(accuracy_score(y_test,y_pred_svm)*100)
print(f1_score(y_test,y_pred_svm)*100)

78.625
82.7910097282791
