# Data mining 
## Final Project
## Μπριάκος Σπυρίδων 1115201700101

### Import python libraries, obtain .csv files and save 'cleaned' train,test dataframes

In [0]:
import os
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import operator
import string
import re
from wordcloud import STOPWORDS

#### Obtain .csv files which we are going to use them.

In [297]:
from google.colab import drive
drive.mount('/content/drive/')

train_df = pd.read_csv(r'/content/drive/My Drive/data/train.csv')
test_df = pd.read_csv(r'/content/drive/My Drive/data/impermium_verification_labels.csv') 

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


#### Convertion to lower cases and removal of links.

In [0]:
#Lower cases and removal of punctuations...
for i, row in train_df.iterrows():
    train_df.loc[i,'Comment']  =  train_df.loc[i,'Comment'].lower()
    train_df.loc[i,'Comment'] = ''.join(ch for ch in train_df['Comment'][i] if ch not in set(string.punctuation))
for i, row in test_df.iterrows():
    test_df.loc[i,'Comment']  =  test_df.loc[i,'Comment'].lower()
    test_df.loc[i,'Comment'] = ''.join(ch for ch in test_df['Comment'][i] if ch not in set(string.punctuation))

#Declaration of function remove_tags, which will help us remove html tags.
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)
    
#Removal of html tags...
train_df['Comment'] = train_df['Comment'].apply(lambda x: remove_tags(x))
test_df['Comment'] = test_df['Comment'].apply(lambda x: remove_tags(x))
    
#Removal of links...
train_df['Comment'] = train_df['Comment'].replace(r'http\S+', '', regex=True).replace(r'www.\S+', '', regex=True).replace(r'"', '', regex=True)
test_df['Comment'] = test_df['Comment'].replace(r'http\S+', '', regex=True).replace(r'www.\S+', '', regex=True).replace(r'"', '', regex=True)

#Removal of symbols...
train_df['Comment'] = train_df['Comment'].replace(r'\\\S+', '', regex=True).replace(r'/', '', regex=True)
test_df['Comment'] = test_df['Comment'].replace(r'\\\S+', '', regex=True).replace(r'/', '', regex=True)

# train_df['Comment'].dropna(inplace=True)
# test_df['Comment'].dropna(inplace=True)
# train_df['Comment'] = train_df['Comment'].astype('U')
# test_df['Comment'] = test_df['Comment'].astype('U')
#Store cleaned train and test dataframes.
train_df.to_csv(r'/content/drive/My Drive/cleaned_train.csv', index=False)
test_df.to_csv(r'/content/drive/My Drive/cleaned_test.csv', index=False)

### Naive Bayes Scores (first try)

#### Vectorization with BoW

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

#Using simple CountVectorizer.
count_vectorizer = CountVectorizer()
bowX_train = count_vectorizer.fit_transform(train_df['Comment'].tolist()) 
bowX_test = count_vectorizer.transform(test_df['Comment'].tolist()) 

#Forming labels, which are going to help us with prediction and evaluation.
label_train = train_df['Insult'].tolist()
label_test = test_df['Insult'].tolist()

#### First Scores of Naive Bayes, only with Bag_of_Words

In [300]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score,accuracy_score

#Using Gaussian Naive Bayes Classifier and train data, then predict and evaluate score on test data.
gnb1 = GaussianNB()
gnb1.fit(bowX_train.toarray(), label_train)
predictions = gnb1.predict(bowX_test.toarray())

#F1 
first_nb_f1_score = "{:.2f}".format(f1_score(label_test,predictions,average='weighted'))
print('F1 score of Bag-of-words with Naive-Bayes Classifier: ' , first_nb_f1_score)

#Accuracy
first_nb_acc_score = "{:.2f}".format(accuracy_score(label_test,predictions))
print('Accuracy score of Bag-of-words with Naive-Bayes Classifier: ' , first_nb_acc_score)

F1 score of Bag-of-words with Naive-Bayes Classifier:  0.53
Accuracy score of Bag-of-words with Naive-Bayes Classifier:  0.53


### Optimization of Naive Bayes with lemmatization

#### Obtain cleaned train,test dataframes, lemmatization and then Vectorization with BoW









In [301]:
from sklearn.feature_extraction.text import CountVectorizer
import nltk
nltk.download('wordnet')

train_df = pd.read_csv(r'/content/drive/My Drive/cleaned_train.csv')
test_df = pd.read_csv(r'/content/drive/My Drive/cleaned_test.csv') 

# Lemmatization on data
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

#Functions, first lemmatize a word and second just concatenate strings of a list to a simple string.
def lemmatize_text(text):
    return [lemmatizer.lemmatize(w, pos="v") for w in w_tokenizer.tokenize(text)]
def concat(text):
    concat_text = " ".join(i for i in text)
    return concat_text

train_df['Comment'] = train_df['Comment'].astype(str)
train_df['Comment'] = train_df['Comment'].apply(lemmatize_text).copy()
train_df['Comment'] = train_df['Comment'].apply(lambda x: concat(x))
test_df['Comment'] = test_df['Comment'].astype(str)
test_df['Comment'] = test_df['Comment'].apply(lemmatize_text).copy()
test_df['Comment'] = test_df['Comment'].apply(lambda x: concat(x))

#Using simple CountVectorizer.
count_vectorizer = CountVectorizer()
bowX_train = count_vectorizer.fit_transform(train_df['Comment'].tolist()) 
bowX_test = count_vectorizer.transform(test_df['Comment'].tolist()) 

#Forming labels, which are going to help us with prediction and evaluation.
label_train = train_df['Insult'].tolist()
label_test = test_df['Insult'].tolist()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


##### Naive Bayes scores with lemmatization


In [302]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score,accuracy_score

#Using Naive Bayes Classifier and train data, then predict and evaluate score on test data.
nb = GaussianNB() 
nb.fit(bowX_train.toarray(), label_train)
predictions = nb.predict(bowX_test.toarray())

#F1 
nb_f1_lemma_score = "{:.2f}".format(f1_score(label_test,predictions,average='weighted'))
print('F1 score of Bag-of-words (lemmatization) with Naive-Bayes Classifier: ' , nb_f1_lemma_score)

#Accuracy
nb_acc_lemma_score = "{:.2f}".format(accuracy_score(label_test,predictions))
print('Accuracy score of Bag-of-words (lemmatization) with Naive-Bayes Classifier: ' , nb_acc_lemma_score)

F1 score of Bag-of-words (lemmatization) with Naive-Bayes Classifier:  0.51
Accuracy score of Bag-of-words (lemmatization) with Naive-Bayes Classifier:  0.52


#### Obtain cleaned train,test dataframes, stopwords and then Vectorization with BoW

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

train_df = pd.read_csv(r'/content/drive/My Drive/cleaned_train.csv')
test_df = pd.read_csv(r'/content/drive/My Drive/cleaned_test.csv') 

train_df['Comment'] = train_df['Comment'].astype('U')
test_df['Comment'] = test_df['Comment'].astype('U')

#Using simple CountVectorizer.
count_vectorizer = CountVectorizer(stop_words='english')
bowX_train = count_vectorizer.fit_transform(train_df['Comment'].tolist()) 
bowX_test = count_vectorizer.transform(test_df['Comment'].tolist()) 

#Forming labels, which are going to help us with prediction and evaluation.
label_train = train_df['Insult'].tolist()
label_test = test_df['Insult'].tolist()

##### Naive Bayes scores with removal of stopwords

In [304]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score,accuracy_score

#Using Naive Bayes Classifier and train data, then predict and evaluate score on test data.
nb = GaussianNB() 
nb.fit(bowX_train.toarray(), label_train)
predictions = nb.predict(bowX_test.toarray())

#F1 
nb_f1_stop_score = "{:.2f}".format(f1_score(label_test,predictions,average='weighted'))
print('F1 score of Bag-of-words (stopwords) with Naive-Bayes Classifier: ' , nb_f1_stop_score)

#Accuracy
nb_acc_stop_score = "{:.2f}".format(accuracy_score(label_test,predictions))
print('Accuracy score of Bag-of-words (stopwords) with Naive-Bayes Classifier: ' , nb_acc_stop_score)

F1 score of Bag-of-words (stopwords) with Naive-Bayes Classifier:  0.53
Accuracy score of Bag-of-words (stopwords) with Naive-Bayes Classifier:  0.53


#### Obtain cleaned train,test dataframes, bigrams and then Vectorization with BoW

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

train_df = pd.read_csv(r'/content/drive/My Drive/cleaned_train.csv')
test_df = pd.read_csv(r'/content/drive/My Drive/cleaned_test.csv') 

train_df['Comment'] = train_df['Comment'].astype('U')
test_df['Comment'] = test_df['Comment'].astype('U')

#Using simple CountVectorizer.
count_vectorizer = CountVectorizer(ngram_range=(2,2))
bowX_train = count_vectorizer.fit_transform(train_df['Comment'].tolist()) 
bowX_test = count_vectorizer.transform(test_df['Comment'].tolist()) 

#Forming labels, which are going to help us with prediction and evaluation.
label_train = train_df['Insult'].tolist()
label_test = test_df['Insult'].tolist()

##### Naive Bayes scores with bigrams

In [306]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score,accuracy_score

#Using Naive Bayes Classifier and train data, then predict and evaluate score on test data.
nb = GaussianNB() 
nb.fit(bowX_train.toarray(), label_train)
predictions = nb.predict(bowX_test.toarray())

#F1 
nb_f1_bigram_score = "{:.2f}".format(f1_score(label_test,predictions,average='weighted'))
print('F1 score of Bag-of-words (bigrams) with Naive-Bayes Classifier: ' , nb_f1_bigram_score)

#Accuracy
nb_acc_bigram_score = "{:.2f}".format(accuracy_score(label_test,predictions))
print('Accuracy score of Bag-of-words (bigrams) with Naive-Bayes Classifier: ' , nb_acc_bigram_score)

F1 score of Bag-of-words (bigrams) with Naive-Bayes Classifier:  0.56
Accuracy score of Bag-of-words (bigrams) with Naive-Bayes Classifier:  0.56


#### Obtain cleaned train,test dataframes, Laplace Smoothing and then Vectorization with BoW

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

train_df = pd.read_csv(r'/content/drive/My Drive/cleaned_train.csv')
test_df = pd.read_csv(r'/content/drive/My Drive/cleaned_test.csv') 

train_df['Comment'] = train_df['Comment'].astype('U')
test_df['Comment'] = test_df['Comment'].astype('U')

#Using simple CountVectorizer.
count_vectorizer = CountVectorizer()
bowX_train = count_vectorizer.fit_transform(train_df['Comment'].tolist()) 
bowX_test = count_vectorizer.transform(test_df['Comment'].tolist()) 

#Forming labels, which are going to help us with prediction and evaluation.
label_train = train_df['Insult'].tolist()
label_test = test_df['Insult'].tolist()

##### Naive Bayes scores with Laplace Smoothing

In [308]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score,accuracy_score

#Using Multinomial Naive Bayes Classifier and train data, then predict and evaluate score on test data.
mnb = MultinomialNB() #default alpha=1.0, which enables Laplace Smoothing.
mnb.fit(bowX_train.toarray(), label_train)
predictions = mnb.predict(bowX_test.toarray())

#F1 
nb_f1_laplace_score = "{:.2f}".format(f1_score(label_test,predictions,average='weighted'))
print('F1 score of Bag-of-words with Multinomial Naive-Bayes Classifier: ' , nb_f1_laplace_score)

#Accuracy
nb_acc_laplace_score = "{:.2f}".format(accuracy_score(label_test,predictions))
print('Accuracy score of Bag-of-words) with Multinomial Naive-Bayes Classifier: ' , nb_acc_laplace_score)

F1 score of Bag-of-words with Multinomial Naive-Bayes Classifier:  0.67
Accuracy score of Bag-of-words) with Multinomial Naive-Bayes Classifier:  0.68


### TF-IDF based characteristics

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

train_df = pd.read_csv(r'/content/drive/My Drive/cleaned_train.csv')
test_df = pd.read_csv(r'/content/drive/My Drive/cleaned_test.csv') 

train_df['Comment'] = train_df['Comment'].astype('U')
test_df['Comment'] = test_df['Comment'].astype('U')

#Convert to vectors with TF-IDF Vectorizer...
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidfX_train = tfidf_vectorizer.fit_transform(train_df['Comment'].tolist())  
tfidfX_test = tfidf_vectorizer.transform(test_df['Comment'].tolist()) 

#Store to arrays the results of TF-TDF Vectorizer and later we'll add 4 frequencies (adverbs,verbs,adjectives,nouns)
train_characteristics_array = tfidfX_train.toarray()
test_characteristics_array = tfidfX_test.toarray()

### Part-Of-Speech based characteristics

In [310]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

#Create two empty lists which we'll fill with info in order to have 2 complex arrays (one for train data and one for test data.)
final_train_array = []
final_test_array = []

#Loop through each comment...
for i, row in train_df.iterrows():
    adverbs = 0.00
    verbs = 0.00
    adjectives = 0.00
    nouns = 0.00
    pos_list=[]
    #Tokenize each word of each comment ant then with pos_tag from nltk keep in a list info of kind of each word.
    pos_list = nltk.word_tokenize(train_df.loc[i,'Comment'])
    pos_list = nltk.pos_tag(pos_list)
    
    #Keep info for number of adverbs,verbs,adjectives,nouns in a comment.
    for tuple_ in pos_list:
        if tuple_[1]=="RB" or tuple_[1]=="RBR" or tuple_[1]=="RBS":
            adverbs = adverbs + 1
        if tuple_[1]=="VBP" or tuple_[1]=="VB" or tuple_[1]=="VBN" or tuple_[1]=="VBG" or tuple_[1]=="VBD" or tuple_[1]=="VBZ":
            verbs = verbs + 1
        if tuple_[1]=="JJ" or tuple_[1]=="JJR" or tuple_[1]=="JJS":
            adjectives = adjectives + 1
        if tuple_[1]=="NN" or tuple_[1]=="NNS":
            nouns = nouns + 1            
    #If we have info calculate percentages of adverbs,verbs,adjectives,nouns based on the sum of words into a comment.
    if len(pos_list)!=0:
        adverbs = adverbs/len(pos_list)
        verbs = verbs/len(pos_list)   
        adjectives = adjectives/len(pos_list)  
        nouns = nouns/len(pos_list) 

    #Store in a temporary list characteristics of TF-IDF and 4 percentages at the end of each list (about adverbs,verbs,adjectives,nouns) and then 
    #append this temp_list into the final 'complex' list.
    temp_list = []
    temp_list = train_characteristics_array[i].tolist()
    temp_list.append(adverbs)
    temp_list.append(verbs)
    temp_list.append(adjectives)
    temp_list.append(nouns)
    final_train_array.append(temp_list)

#Here we are doing the same process as before, but now for test data...
for i, row in test_df.iterrows():
    adverbs = 0.00
    verbs = 0.00
    adjectives = 0.00
    nouns = 0.00
    pos_list=[]
    pos_list = nltk.word_tokenize(test_df.loc[i,'Comment'])
    pos_list = nltk.pos_tag(pos_list)
    
    for tuple_ in pos_list:
        if tuple_[1]=="RB" or tuple_[1]=="RBR" or tuple_[1]=="RBS":
            adverbs = adverbs + 1
        if tuple_[1]=="VBP" or tuple_[1]=="VB" or tuple_[1]=="VBN" or tuple_[1]=="VBG" or tuple_[1]=="VBD" or tuple_[1]=="VBZ":
            verbs = verbs + 1
        if tuple_[1]=="JJ" or tuple_[1]=="JJR" or tuple_[1]=="JJS":
            adjectives = adjectives + 1
        if tuple_[1]=="NN" or tuple_[1]=="NNS":
            nouns = nouns + 1            
    
    if len(pos_list)!=0:
        adverbs = adverbs/len(pos_list)
        verbs = verbs/len(pos_list)    
        adjectives = adjectives/len(pos_list)
        nouns = nouns/len(pos_list)
    
    temp_list = []
    temp_list = test_characteristics_array[i].tolist()
    temp_list.append(adverbs)
    temp_list.append(verbs)
    temp_list.append(adjectives)
    temp_list.append(nouns)
    final_test_array.append(temp_list)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### TFIDF+POS complex array

In [311]:
#Convert lists of lists to np.arrays, cause we are going to use them later (on SVM & Random Forest classifier)
train_complex_array = np.array(final_train_array)
test_complex_array = np.array(final_test_array)

#Print some useful infos so as to prove that we have added infos about percentages of adverbs,verbs,adjectives,nouns.
print("TF-IDF train's shape: ", train_characteristics_array.shape)
print("Final train complex array's shape: ", train_complex_array.shape)
print("TF-IDF test's shape:", test_characteristics_array.shape)
print("Final complex test array's shape: ", test_complex_array.shape)

TF-IDF train's shape:  (3947, 1000)
Final train complex array's shape:  (3947, 1004)
TF-IDF test's shape: (2235, 1000)
Final complex test array's shape:  (2235, 1004)


### Now we are going to try SVM and Random Forest classifiers with complex arrays that we have just created.

#### 1) SVM Classifier 

In [312]:
from sklearn import svm 
from sklearn.metrics import f1_score,accuracy_score

#Using SVM Classifier and train data, then predict and evaluate score on test data.
svm_clf = svm.SVC()
svm_clf.fit(train_complex_array, label_train)
predictions = svm_clf.predict(test_complex_array)

#F1 
first_svm_f1_score = "{:.2f}".format(f1_score(label_test,predictions,average='weighted'))
print('F1 score of TF-IDF and POS based features with SVM Classifier: ' , first_svm_f1_score)

#Accuracy
first_svm_acc_score = "{:.2f}".format(accuracy_score(label_test,predictions))
print('Accuracy score of TF-IDF and POS based features with SVM Classifier: ' , first_svm_acc_score)

F1 score of TF-IDF and POS based features with SVM Classifier:  0.64
Accuracy score of TF-IDF and POS based features with SVM Classifier:  0.67


#### 2) Random Forest Classifier

In [313]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score,accuracy_score

#Using Random Forest Classifier and train data, then predict and evaluate score on test data.
rf = RandomForestClassifier()
rf.fit(train_complex_array, label_train)
predictions = rf.predict(test_complex_array)

#F1 
first_rf_f1_score = "{:.2f}".format(f1_score(label_test,predictions,average='weighted'))
print('F1 score of TF-IDF and POS based features with Random Forest Classifier: ' , first_rf_f1_score)

#Accuracy
first_rf_acc_score = "{:.2f}".format(accuracy_score(label_test,predictions))
print('Accuracy score of TF-IDF and POS based features with Random Forest Classifier: ' , first_rf_acc_score)

F1 score of TF-IDF and POS based features with Random Forest Classifier:  0.60
Accuracy score of TF-IDF and POS based features with Random Forest Classifier:  0.64



### Beat the Benchmark

#### Lemmatization again on clean_data (train & test)

In [314]:
import nltk
nltk.download('wordnet')

train_df = pd.read_csv(r'/content/drive/My Drive/cleaned_train.csv')
test_df = pd.read_csv(r'/content/drive/My Drive/cleaned_test.csv') 

# Lemmatization on data
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

#Functions, first lemmatize a word and second just concatenate strings of a list to a simple string.
def lemmatize_text(text):
    return [lemmatizer.lemmatize(w, pos="v") for w in w_tokenizer.tokenize(text)]
def concat(text):
    concat_text = " ".join(i for i in text)
    return concat_text

train_df['Comment'] = train_df['Comment'].astype(str)
train_df['Comment'] = train_df['Comment'].apply(lemmatize_text).copy()
train_df['Comment'] = train_df['Comment'].apply(lambda x: concat(x))
test_df['Comment'] = test_df['Comment'].astype(str)
test_df['Comment'] = test_df['Comment'].apply(lemmatize_text).copy()
test_df['Comment'] = test_df['Comment'].apply(lambda x: concat(x))

#Forming labels, which are going to help us with prediction and evaluation.
label_train = train_df['Insult'].tolist()
label_test = test_df['Insult'].tolist()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


#### TFIDF Vectorizer and added info about pronouns of each comment 

In [315]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm 
from sklearn.metrics import f1_score,accuracy_score

#Convert to vectors with TF-IDF Vectorizer...
tfidf_vectorizer = TfidfVectorizer(max_features=1700)
tfidfX_train = tfidf_vectorizer.fit_transform(train_df['Comment'].tolist())  
tfidfX_test = tfidf_vectorizer.transform(test_df['Comment'].tolist()) 

#Convert results of TFIDF Vectorizer to arrays.
train_characteristics_array = tfidfX_train.toarray()
test_characteristics_array = tfidfX_test.toarray()

#Create two empty lists which we'll fill with info in order to have 2 complex arrays (one for train data and one for test data.)
final_train_array = []
final_test_array = []

#Loop through each comment...
for i, row in train_df.iterrows():
    pronouns=0
    pos_list=[]
    
    pos_list = nltk.word_tokenize(train_df.loc[i,'Comment'])
    pos_list1 = nltk.pos_tag(pos_list)
    for tuple_ in pos_list1:
      if tuple_[1]=="PRP" or tuple_[1]=="PRP$":
          pronouns = pronouns + 1  
    
    #Store in a temporary list characteristics of TF-IDF and number of pronouns of each list.
    temp_list = []
    temp_list = train_characteristics_array[i].tolist()
    temp_list.append(pronouns)
    final_train_array.append(temp_list)

#Here we are doing the same process as before, but now for test data...
for i, row in test_df.iterrows():
    pronouns = 0
    pos_list=[]
    
    pos_list = nltk.word_tokenize(test_df.loc[i,'Comment'])
    pos_list1 = nltk.pos_tag(pos_list)
    #For each comment find how many pronouns exist and store it to pronouns, so as to add it as an extra feature.
    for tuple_ in pos_list1:
      if tuple_[1]=="PRP" or tuple_[1]=="PRP$":
          pronouns = pronouns + 1
    
    temp_list = []
    temp_list = test_characteristics_array[i].tolist()
    temp_list.append(pronouns)
    final_test_array.append(temp_list)


train_addedinfo_array = np.array(final_train_array)
test_addedinfo_array = np.array(final_test_array)

#Using SVM Classifier and train data, then predict and evaluate score on test data.
svm_clf = svm.SVC(kernel='rbf',C=1690,gamma=0.001)
svm_clf.fit(train_addedinfo_array, label_train)
predictions = svm_clf.predict(test_addedinfo_array)

second_svm_f1_score = "{:.2f}".format(f1_score(label_test,predictions),average='weighted')
print('F1 score of TF-IDF and pronoun-added feature with SVM Classifier: ' , second_svm_f1_score)
second_svm_acc_score = "{:.2f}".format(accuracy_score(label_test,predictions))
print('Accuracy score of TF-IDF and pronoun-added feature with SVM Classifier: ' , second_svm_acc_score)

F1 score of TF-IDF and pronoun-added feature with SVM Classifier:  0.64
Accuracy score of TF-IDF and pronoun-added feature with SVM Classifier:  0.71


### Results & Conclusions

#### Results & Conclusions about Naive Bayes

In [320]:
print("Scorer                 F1    Accuracy")
print("BOW                   ",first_nb_f1_score,"",first_nb_acc_score)
print("BOW+Lemmatization:    ",nb_f1_lemma_score,"",nb_acc_lemma_score)
print("BOW+StopWords:        ",nb_f1_stop_score,"",nb_acc_stop_score)
print("BOW+Bigrams:          ",nb_f1_bigram_score,"",nb_acc_bigram_score)
print("BOW+Laplace Smoothing:",nb_f1_laplace_score,"",nb_acc_laplace_score)

Scorer                 F1    Accuracy
BOW                    0.53  0.53
BOW+Lemmatization:     0.51  0.52
BOW+StopWords:         0.53  0.53
BOW+Bigrams:           0.56  0.56
BOW+Laplace Smoothing: 0.67  0.68


#####  


---


* Firstly we have to refer that our dataset (train & test data) is made up of Comments, which through thorough observation of them I concluded that a quite large portion of Comments 
uses oral language (like slang or cant). 
* So this unique whim of our dataset makes our job extremely difficult, cause there are a lot of words which are completely uknown to our models and some of them they are used rarely 
(such as: "How old are u 12? \n\nKnicks fan **BEEYATCH**!!!\n\nNot a Melo ball licker!!!").
). 


---



1) Our NB model has the **worst** score when we applied, to our dataset, lemmatization and the explanation of this performance is that lemmatization 
maybe is affected in a small scale by this unique whim of our dataset.


---
2) Our NB model has the **best** score when we applied, to our dataset, MultinomialNB (alpha=1.0), which practically means that we applied Laplace smoothing. The reason of this good score compared with previous techniques is that has the value alpha=1.0 and this practically means that probability of each word cannot be zero, so with that way we regularize Naive Bayes. This helps us cause words that are used rarely would have probability zero in simple Naive Bayes and this would have a bad impact in the prediction of each comment, but with Laplace Smoothing we are 'giving the chance' to words that are used very commonly to play the most significant role and thus these predict more correct Comments.


---
3) Furthermore, stopwords seems to have the same behavior if with the case of not removing them and this stands by, because maybe model is balancing between of common useful words such as you,your which are thrown away and common completely unuseful words such as and,the.


---





#### Results & Conclusions about SVM,Random Forest on TFIDF+PartOfSpeech array


In [317]:
print("Scorer                           F1    Accuracy")
print("TFIDF+PartOfSpeech+RandomForest:",first_rf_f1_score,"",first_rf_acc_score)
print("TFIDF+PartOfSpeech+SVM:         ",first_svm_f1_score,"",first_svm_acc_score)

Scorer                           F1    Accuracy
TFIDF+PartOfSpeech+RandomForest: 0.60  0.64
TFIDF+PartOfSpeech+SVM:          0.64  0.67


##### As we can observe... 
* SVM classifier was better than Random Forest classifier and so does generally accuracy score compared with f1 score.

* The reason of this optimization is not only from TFIDF Vectorizer, which worked better than BoW and POS tags which give more info about each comment, but also from SVM and Random Forest Classifier which were more efficient on our dataset.

#### Results & Conclusions about Beat the Benchhmark


In [318]:
print("Scorer                        F1    Accuracy")
print("TFIDF+Lemmatization+Pronouns:",second_svm_f1_score,"",second_svm_acc_score)

Scorer                        F1    Accuracy
TFIDF+Lemmatization+Pronouns: 0.64  0.71


---
#####SVM classifier with accuracy score has the **best** score in this notebook and made a quite small optimization (0.67->0.71). 

* To achive this score i applied lemmatization on our dataset and TFIDF Vectorization (which obviously gave us better scores than BoW).

*  Additionally, I calculate number of pronouns for each Comment and added this significant quantity in the end of TFIDF vectorized array. My initiative was made cause I searched on Internet and I found that this dataset is from a specific contest of KAGGLE and in important notes it was emphasized: 


---


1) We are looking for comments that are intended to be insulting to a person who is a part of the larger blog/forum conversation. 

2) We are NOT looking for insults directed to non-participants (such as celebrities, public figures etc.). 

3) Insults could contain profanity, racial slurs, or other offensive language. But often times, they do not. 

4) Comments which contain profanity or racial slurs, but are not necessarily insulting to another person are considered not insulting." 



---
* So for this reason I concluded that,because many labeled as insult Comments have a lot of pronouns, it will play significant role to our final prediction.


---


For example this Comment is assumed to be non-insult:

"fuck the judges..better stop the boxing events!"				

For example this Comment is assumed to be insult:

"You're a fucking joke."




---



* Except of all that I referred right before, one extra factor for my improvement in new score is undoubtly parameters of SVM classifier, which I find through after many attempts with random parameters.

##### Note: In all of our attempts we have done a 'small' cleaning. To be more specific we removed html tags,urls,punctuations and symbols.