# Quora Text Classification

## Imports

In [1]:
import pandas as pd
import numpy as np
import string
import re 
import os

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Train-test split and cross validation
from sklearn.model_selection import train_test_split, ParameterGrid
# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import RidgeClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB as mnb


In [3]:

from urllib.request import urlretrieve as ur
import pandas as pd
import jovian as jn
import matplotlib.pyplot as plt
import numpy as np
import plotly.subplots
import plotly.graph_objs as go

<IPython.core.display.Javascript object>

In [4]:
# NLP
import string, re, nltk
from string import punctuation
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords

from num2words import num2words
import nltk
from spellchecker import SpellChecker
from nltk.stem.porter import PorterStemmer
import spacy


from nltk.stem import WordNetLemmatizer

# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Scipy
import scipy
from scipy import sparse
from scipy.sparse import csr_matrix

In [5]:
# Model evaluation
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Others
import json
import gensim
from sklearn.decomposition import TruncatedSVD

In [6]:
#! kaggle competitions download -c quora-insincere-questions-classification


In [7]:


# Get the current working directory
currentDir  = os.getcwd()
print(currentDir)

/Users/siddharthpangotra/Documents/GitHub/sid4ML/4ML/ClassificationProblems/QuoraTextClassification


In [8]:
#train_df = pd.read_csv('/Data/quora-insincere-questions-classification/train.csv')
train_df = pd.read_csv('/Users/siddharthpangotra/Documents/GitHub/sid4ML/4ML/ClassificationProblems/QuoraTextClassification/Data/quora-insincere-questions-classification/train.csv')

In [9]:
train_df.shape

(1306122, 3)

## Target class value counts
<ul> 0 - Sincere Question </ul>
<ul> 1 - Insincere Question


In [10]:
train_df.target.value_counts()

target
0    1225312
1      80810
Name: count, dtype: int64

In [11]:
train_df.head(10)

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0
5,00004f9a462a357c33be,"Is Gaza slowly becoming Auschwitz, Dachau or T...",0
6,00005059a06ee19e11ad,Why does Quora automatically ban conservative ...,0
7,0000559f875832745e2e,Is it crazy if I wash or wipe my groceries off...,0
8,00005bd3426b2d0c8305,"Is there such a thing as dressing moderately, ...",0
9,00006e6928c5df60eacb,Is it just me or have you ever been in this ph...,0


Verify the duplicate records, drop if any

In [12]:
print("Total records",train_df.shape[0],"\tUnique records",len(train_df.question_text.unique()))

Total records 1306122 	Unique records 1306122


## Splitting the data into
<ul>1. Train</ul>
<ul>2. Test</ul>
<ul>3. Validation</ul>

In [13]:
o_data=train_df.reset_index(drop=True)
# Feature-target split
X, y = o_data.drop('target', axis = 1), o_data['target']

# Train-test split (from complete data)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
data_train = pd.concat([X_train, y_train], axis = 1)


# Validation-test split (from test data)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size = 0.5, random_state = 42)
data_val, data_test = pd.concat([X_val, y_val], axis = 1), pd.concat([X_test, y_test], axis = 1)

# Comparison of sizes of training set, validation set and test set
values = np.array([len(data_train), len(data_val), len(data_test)])
labels = ['Training set', 'Validation Set', 'Test set']
fig = go.Figure(data = [go.Pie(values = values, labels = labels, hole = 0.5, textinfo = 'value', title = " ")])
text_title = "Comparison of sizes of training set, validation set and test set"
fig.update_layout(height = 500, width = 800, showlegend = True, title = dict(text = text_title, x = 0.5, y = 0.95)) 
fig.show()

Train VS Test VS Val data distribution

In [14]:
# Create a smaller sample of training data
small_o_data=data_train.reset_index(drop=True)
# Feature-target split
X, y = small_o_data.drop('target', axis = 1), small_o_data['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.8, random_state = 42)
small_data_train = pd.concat([X_train, y_train], axis = 1)

small_data_train.shape

(208979, 3)

In [15]:
print("Train Data Distribution:",y_train.value_counts(),"\n Test Data Distribution:",y_test.value_counts(),"\n Validation Data:",y_val.value_counts())

Train Data Distribution: target
0    195924
1     13055
Name: count, dtype: int64 
 Test Data Distribution: target
0    784019
1     51899
Name: count, dtype: int64 
 Validation Data: target
0    122631
1      7981
Name: count, dtype: int64


# Text Normalization
<br>Conversion to Lowercase
<br>Removal of Whitespaces
<br>Removal of Punctuations
<br>Removal of Unicode Characters
<br>Substitution of Acronyms
<br>Substitution of Contractions
<br>Removal of Stop Words
<br>Spelling Correction
<br>Stemming and Lemmatization
<br>Discardment of Non-alphabetic Words
<br>Retainment of Relevant Parts of Speech
<br>Removal of Additional Stop Words
<br>Integration of the Processes
<br>Implementation on Product Description

Conversion to Lowercase

In [16]:
def convertToLower(text):
    return str(text).lower()

Removal of Whitespaces

In [17]:
# Removing whitespaces
def remove_whitespace(text):
    return text.strip()

text = " \t This is a string \t "
print("Input: {}".format(text))
print("Output: {}".format(remove_whitespace(text)))

Input:  	 This is a string 	 
Output: This is a string


Removal of Punctuations

In [18]:
# Removing punctuations
def remove_punctuation(text):
    punct_str = string.punctuation
    punct_str = punct_str.replace("'", "") # discarding apostrophe from the string to keep the contractions intact
    return text.translate(str.maketrans("", "", punct_str))

In [19]:
text = "Here's [an] example? {of} &a string. with.? her's punctuations!!!!"

In [20]:
print("Output: {}".format(remove_punctuation(text)))

Output: Here's an example of a string with her's punctuations


Removal of Html tags

In [21]:
# Removing HTML tags
def remove_html(text):
    html = re.compile(r'<.*?>')
    return html.sub(r'', text)

text = '<a href = "https://www.kaggle.com/datasets/saurabhshahane/ecommerce-text-classification"> Ecommerce Text Classification </a>'
print("Input: {}".format(text))
print("Output: {}".format(remove_html(text)))

Input: <a href = "https://www.kaggle.com/datasets/saurabhshahane/ecommerce-text-classification"> Ecommerce Text Classification </a>
Output:  Ecommerce Text Classification 


Removal of Unicode Characters

In [22]:
# Removing emojis
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags = re.UNICODE)
    return emoji_pattern.sub(r'', text)

text = "This innovative hd printing technique results in durable and spectacular looking prints 😊"
print("Input: {}".format(text))
print("Output: {}".format(remove_emoji(text)))

Input: This innovative hd printing technique results in durable and spectacular looking prints 😊
Output: This innovative hd printing technique results in durable and spectacular looking prints 


In [23]:
# Removing other unicode characters
def remove_http(text):
    http = "https?://\S+|www\.\S+" # matching strings beginning with http (but not just "http")
    pattern = r"({})".format(http) # creating pattern
    return re.sub(pattern, "", text)

text = "It's a function that removes links starting with http: or https such as https://en.wikipedia.org/wiki/Unicode_symbols"
print("Input: {}".format(text))
print("Output: {}".format(remove_http(text)))

Input: It's a function that removes links starting with http: or https such as https://en.wikipedia.org/wiki/Unicode_symbols
Output: It's a function that removes links starting with http: or https such as 


Substitution of Acronyms

In [24]:

# Dictionary of acronyms
#acronyms_url = 'https://raw.githubusercontent.com/sugatagh/E-commerce-Text-Classification/main/JSON/english_acronyms.json'json 
acronyms_dict = pd.read_json(  currentDir+'/Data/acrony_file.json', typ = 'series',  )



print("Example: Original form of the acronym 'fyi' is '{}'".format(acronyms_dict['fyi']))

Example: Original form of the acronym 'fyi' is 'for your information'


In [25]:
# Dataframe of acronyms
pd.DataFrame(acronyms_dict.items(), columns = ['acronym', 'original']).head()

Unnamed: 0,acronym,original
0,aka,also known as
1,asap,as soon as possible
2,brb,be right back
3,btw,by the way
4,dob,date of birth


In [26]:
# List of acronyms
acronyms_list = list(acronyms_dict.keys())

In [27]:
# RegexpTokenizer
regexp = RegexpTokenizer("[\w']+")

Substitution of Contractions

In [28]:
# Function to convert contractions in a text
def convert_acronyms(text):
    words = []
    for word in regexp.tokenize(text):
        if word in acronyms_list:
            words = words + acronyms_dict[word].split()
        else:
            words = words + word.split()
    
    text_converted = " ".join(words)
    return text_converted

text = "btw you've to fill in the details including dob"
print("Input: {}".format(text))
print("Output: {}".format(convert_acronyms(text)))

Input: btw you've to fill in the details including dob
Output: by the way you've to fill in the details including date of birth


In [29]:
# Dictionary of contractions
contractions_url = 'https://raw.githubusercontent.com/sugatagh/E-commerce-Text-Classification/main/JSON/english_contractions.json'
contractions_dict = pd.read_json(currentDir+'/Data/collection_file.json', typ = 'series')

print("Example: Original form of the contraction 'aren't' is '{}'".format(contractions_dict["aren't"]))

Example: Original form of the contraction 'aren't' is 'are not'


In [30]:
# Dataframe of contractions
pd.DataFrame(contractions_dict.items(), columns = ['contraction', 'original']).head()

Unnamed: 0,contraction,original
0,'aight,alright
1,ain't,are not
2,amn't,am not
3,arencha,are not you
4,aren't,are not


In [31]:
# List of contractions
contractions_list = list(contractions_dict.keys())
# Function to convert contractions in a text
def convert_contractions(text):
    words = []
    for word in regexp.tokenize(text):
        if word in contractions_list:
            words = words + contractions_dict[word].split()
        else:
            words = words + word.split()
    
    text_converted = " ".join(words)
    return text_converted

text = "he's doin' fine"
print("Input: {}".format(text))
print("Output: {}".format(convert_contractions(text)))

Input: he's doin' fine
Output: he is doing fine


Stopwords

In [32]:
# Stopwords
stops = stopwords.words("english") # stopwords
addstops = ["among", "onto", "shall", "thrice", "thus", "twice", "unto", "us", "would"] # additional stopwords
allstops = stops + addstops

print(allstops)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [33]:
# Function to remove stopwords from a list of texts
def remove_stopwords(text):
    return " ".join([word for word in regexp.tokenize(text) if word not in allstops])

text = "This is a function that removes stopwords in a given text"
print("Input: {}".format(text))
print("Output: {}".format(remove_stopwords(text)))

Input: This is a function that removes stopwords in a given text
Output: This function removes stopwords given text


SpellChecker

In [34]:
# pyspellchecker
spell = SpellChecker()

def pyspellchecker(text):
    word_list = regexp.tokenize(text)
    word_list_corrected = []
    for word in word_list:
        if word in spell.unknown(word_list):
            word_corrected = spell.correction(word)
            if word_corrected == None:
                word_list_corrected.append(word)
            else:
                word_list_corrected.append(word_corrected)
        else:
            word_list_corrected.append(word)
    text_corrected = " ".join(word_list_corrected)
    return text_corrected

text = "I'm a fixteen "
print("Input: {}".format(text))
print("Output: {}".format(pyspellchecker(text)))

Input: I'm a fixteen 
Output: I'm a fifteen


Stemming

In [35]:
# Stemming
stemmer = PorterStemmer()
def text_stemmer(text):
    text_stem = " ".join([stemmer.stem(word) for word in regexp.tokenize(text)])
    return text_stem

text = "Helping standardization and processing improvization"
print("Input: {}".format(text))
print("Output: {}".format(text_stemmer(text)))


Input: Helping standardization and processing improvization
Output: help standard and process improv


Lemmatization

In [36]:
# Lemmatization
spacy_lemmatizer = spacy.load("en_core_web_sm", disable = ['parser', 'ner'])
#lemmatizer = WordNetLemmatizer()

def text_lemmatizer(text):
    text_spacy = " ".join([token.lemma_ for token in spacy_lemmatizer(text)])
    #text_wordnet = " ".join([lemmatizer.lemmatize(word) for word in word_tokenize(text)]) # regexp.tokenize(text)
    return text_spacy
    #return text_wordnet

text = "Helping standardization & processing improvisation"
print("Input: {}".format(text))
print("Output: {}".format(text_lemmatizer(text)))

Input: Helping standardization & processing improvisation
Output: help standardization & processing improvisation


Removal of non-alphabets

In [37]:
# Discardment of non-alphabetic words
def discard_non_alpha(text):
    word_list_non_alpha = [word for word in regexp.tokenize(text) if word.isalpha()]
    text_non_alpha = " ".join(word_list_non_alpha)
    return text_non_alpha

text = "It is an ocean of thousands and 1000s of crowd"
print("Input: {}".format(text))
print("Output: {}".format(discard_non_alpha(text)))

Input: It is an ocean of thousands and 1000s of crowd
Output: It is an ocean of thousands and of crowd


Part of Speech (POS)

In [38]:
def keep_pos(text):
    tokens = regexp.tokenize(text)
    tokens_tagged = nltk.pos_tag(tokens)
    #keep_tags = ['NN', 'NNS', 'NNP', 'NNPS', 'FW']
    keep_tags = ['NN', 'NNS', 'NNP', 'NNPS', 'FW', 'PRP', 'PRPS', 'RB', 'RBR', 'RBS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WPS', 'WRB']
    keep_words = [x[0] for x in tokens_tagged if x[1] in keep_tags]
    return " ".join(keep_words)

text = "He arrived at seven o'clock on Wednesday evening"
print("Input: {}".format(text))
tokens = regexp.tokenize(text)
print("Tokens: {}".format(tokens))
tokens_tagged = nltk.pos_tag(tokens)
print("Tagged Tokens: {}".format(tokens_tagged))
print("Output: {}".format(keep_pos(text)))

Input: He arrived at seven o'clock on Wednesday evening
Tokens: ['He', 'arrived', 'at', 'seven', "o'clock", 'on', 'Wednesday', 'evening']
Tagged Tokens: [('He', 'PRP'), ('arrived', 'VBD'), ('at', 'IN'), ('seven', 'CD'), ("o'clock", 'NN'), ('on', 'IN'), ('Wednesday', 'NNP'), ('evening', 'NN')]
Output: He arrived o'clock Wednesday evening


In [39]:
# Additional stopwords

alphabets = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"]
prepositions = ["about", "above", "across", "after", "against", "among", "around", "at", "before", "behind", "below", "beside", "between", "by", "down", "during", "for", "from", "in", "inside", "into", "near", "of", "off", "on", "out", "over", "through", "to", "toward", "under", "up", "with"]
prepositions_less_common = ["aboard", "along", "amid", "as", "beneath", "beyond", "but", "concerning", "considering", "despite", "except", "following", "like", "minus", "onto", "outside", "per", "plus", "regarding", "round", "since", "than", "till", "underneath", "unlike", "until", "upon", "versus", "via", "within", "without"]
coordinating_conjunctions = ["and", "but", "for", "nor", "or", "so", "and", "yet"]
correlative_conjunctions = ["both", "and", "either", "or", "neither", "nor", "not", "only", "but", "whether", "or"]
subordinating_conjunctions = ["after", "although", "as", "as if", "as long as", "as much as", "as soon as", "as though", "because", "before", "by the time", "even if", "even though", "if", "in order that", "in case", "in the event that", "lest", "now that", "once", "only", "only if", "provided that", "since", "so", "supposing", "that", "than", "though", "till", "unless", "until", "when", "whenever", "where", "whereas", "wherever", "whether or not", "while"]
others = ["ã", "å", "ì", "û", "ûªm", "ûó", "ûò", "ìñ", "ûªre", "ûªve", "ûª", "ûªs", "ûówe"]
additional_stops = alphabets + prepositions + prepositions_less_common + coordinating_conjunctions + correlative_conjunctions + subordinating_conjunctions + others

def remove_additional_stopwords(text):
    return " ".join([word for word in regexp.tokenize(text) if word not in additional_stops])

## Integrating the pre-processing steps

In [40]:
def text_normalizer(text):
    text = convertToLower(text)
    text = remove_whitespace(text)
    text = re.sub('\n' , '', text) # converting text to one line
    text = re.sub('\[.*?\]', '', text) # removing square brackets
    text = remove_http(text)
    text = remove_punctuation(text)
    text = remove_html(text)
    text = remove_emoji(text)
    text = convert_acronyms(text)
    text = convert_contractions(text)
    #text = remove_stopwords(text)
    text = pyspellchecker(text)
    text = text_lemmatizer(text) # text = text_stemmer(text)
    text = discard_non_alpha(text)
    text = keep_pos(text)
    #text = remove_additional_stopwords(text)
    return text

text = "We'll combine all functions into 1 SINGLE FUNCTION 🙂 & apply on @product #descriptions https://en.wikipedia.org/wiki/Text_normalization"
print("Input: {}".format(text))
print("Output: {}".format(text_normalizer(text)))

Input: We'll combine all functions into 1 SINGLE FUNCTION 🙂 & apply on @product #descriptions https://en.wikipedia.org/wiki/Text_normalization
Output: we combine function function apply product description


Apply pre-processing on Training data

In [41]:
print(data_train.head(10).question_text)

298773     How is strategic positioning is different from...
815475     What is the best way for promote Facebook mark...
1133453    How much energized proton radiation does the I...
1076426    Would any Indian men want to marry a women tha...
203792     Which is the best business for startups in Ind...
215745     Why do NIT students do overactions for everyth...
982650                             Why are orbits important?
509193     What should be my approach before talking to a...
893778     What are some differentiating characteristics ...
60109                   Should I pursue a degree in finance?
Name: question_text, dtype: object


In [42]:
normalized_text = data_train.head(10)['question_text'].apply(text_normalizer)

In [43]:
print (data_train.head(10).question_text.iloc[3]," => ",normalized_text.iloc[3], )

Would any Indian men want to marry a women that doesn't want children?  =>  man want marry woman that do not want child


In [44]:
small_data_train.shape

(208979, 3)

Normalizing the Train,Test and Val data

In [45]:
%%time
# Implementing text normalization
data_train_norm, data_val_norm, data_test_norm = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

data_train.columns
data_train_norm['normalized description'] = small_data_train['question_text'].apply(text_normalizer)
data_val_norm['normalized description'] = data_val['question_text'].apply(text_normalizer)
data_test_norm['normalized description'] = data_test['question_text'].apply(text_normalizer)

data_train_norm['Category'] = data_train['target']
data_val_norm['Category'] = data_val['target']
data_test_norm['Category'] = data_test['target']

data_train_norm

KeyboardInterrupt: 

## TF-IDF

In [None]:
data_train_norm['Category'] = small_data_train['target']
data_val_norm['Category'] = data_val['target']
data_test_norm['Category'] = data_test['target']

Splitting the noramlized data

In [None]:
# Features and labels
X_train_norm, y_train = data_train_norm['normalized description'].tolist(), data_train_norm['Category'].tolist()
X_val_norm, y_val = data_val_norm['normalized description'].tolist(), data_val_norm['Category'].tolist()
X_test_norm, y_test = data_test_norm['normalized description'].tolist(), data_test_norm['Category'].tolist()

In [None]:
y_train

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,


In [None]:
# TF-IDF vectorization
TfidfVec = TfidfVectorizer(ngram_range = (1, 1))
X_train_tfidf = TfidfVec.fit_transform(X_train_norm)
X_val_tfidf = TfidfVec.transform(X_val_norm)
X_test_tfidf = TfidfVec.transform(X_test_norm)

In [None]:
# Classifiers
names = [
    #"Logistic Regression",
    # "KNN Classifier",
    # "Decision Tree",
    # "Linear SVM",
    "Random Forest",
    # "SGD Classifier",
    # "Ridge Classifier",
    # "XGBoost",
    # "AdaBoost",
]

models = [
    #LogisticRegression(max_iter = 100),
    # KNeighborsClassifier(n_neighbors = 149, n_jobs = -1),
    # DecisionTreeClassifier(),
    # svm.SVC(kernel = 'linear'),
    RandomForestClassifier(n_estimators = 100),
    # SGDClassifier(loss = 'hinge'),
    # RidgeClassifier(),
    # XGBClassifier(),
    # AdaBoostClassifier(),
    
]

In [None]:
# Function to return summary of baseline models
import time


def score(X_train, y_train, X_val, y_val, names = names, models = models):
    score_df, score_train, score_val, score_conf = pd.DataFrame(), [], [],[]
    x = time.time()
    for model in models:
        model.fit(X_train, y_train)
        y_train_pred, y_val_pred = model.predict(X_train), model.predict(X_val)
        score
        score_conf.append(confusion_matrix(y_train,y_train_pred))
        score_train.append(accuracy_score(y_train, y_train_pred))

        score_val.append(accuracy_score(y_val, y_val_pred))
    
    score_df["Classifier"], score_df["Training accuracy"], score_df["Validation accuracy"], score_df["Confusion_matrix"] = names, score_train, score_val, score_conf
    score_df.sort_values(by = 'Validation accuracy', ascending = False, inplace = True)
    return score_df

In [None]:
# Summary of baseline models
#score(X_train_tfidf, y_train, X_val_tfidf, y_val, names = names, models = models)

# Machine learning Models

## Naive Bayes


In [None]:

mnbClassfier = mnb()
mnbClassfier.fit(X_train_tfidf,y_train)
y_train_pred_mnb = mnbClassfier.predict(X_train_tfidf)


In [None]:
y_val_pred_mnb = mnbClassfier.predict(X_val_tfidf)
y_test_pred_mnb =  mnbClassfier.predict(X_test_tfidf)

In [None]:
#result = score(X_train_tfidf, y_train, X_val_tfidf, y_val, names = ['Naive Bayes'], models = [mnb()])

## Random Forest

In [None]:
rndFor = RandomForestClassifier(n_estimators = 100)
rndFor.fit(X_train_tfidf,y_train)
y_train_pred_rnd = mnbClassfier.predict(X_train_tfidf)
y_val_pred_rnd = rndFor.predict(X_val_tfidf)
y_test_pred_rnd =  rndFor.predict(X_test_tfidf)

## F1 Score

### MNB

In [None]:
from sklearn.metrics import f1_score

# Calculate F1 score
f1 = f1_score(y_train, y_train_pred_mnb)
f1_val = f1_score(y_val,y_val_pred_mnb)
f1_test = f1_score(y_test,y_test_pred_mnb)
print ("Train",f1,"Val",f1_val,"test", f1_test)

Train 0.07114449845338047 Val 0.02679778733866011 test 0.028428927680798004


### RndFor

In [None]:
f1_rnd = f1_score(y_train, y_train_pred_rnd)
f1_val_rnd = f1_score(y_val,y_val_pred_rnd)
f1_test_rnd = f1_score(y_test,y_test_pred_rnd)
print ("Train",f1,"Val",f1_val_rnd,"test", f1_test_rnd)

Train 0.07114449845338047 Val 0.21996753246753248 test 0.2152015591342702


In [None]:
jn.commit(filename='QuoraCommentClassification.ipynb')