In [None]:
%cd 'drive/My Drive/Colab Notebooks/Project-Data'

/content/drive/My Drive/Colab Notebooks/Project-Data



# Setting up Libraries and Functions for PreProcessing


### Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import seaborn as sns
from nltk.corpus import words, stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import PassiveAggressiveClassifier, LogisticRegression, LogisticRegressionCV
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from gensim.models import KeyedVectors
from scipy.sparse import csr_matrix, hstack
import re, string
from tqdm import tqdm
import copy

In [None]:
# Fetch the NLTK resources this notebook relies on: the stop-word corpus
# (stopwords.words), WordNet (WordNetLemmatizer) and Punkt, which
# nltk.word_tokenize presumably needs for tokenisation.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Reading Training and Testing data

In [None]:
# Load the raw train/test splits from the current working directory.
traindata, testdata = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

### Reading Positive and Negative Word List

In [None]:
# Sentiment word lists are stored one directory above the working directory.
positive_wordlist_path = '../positive-words.txt'
negative_wordlist_path = '../negative-words.txt'

In [None]:
def get_wordlist(path):
  """Read a word-list file and return its lines as a list of strings.

  Only newline characters are stripped from the ends of each line; nothing
  is filtered out, so a blank line comes back as an empty string.

  Args:
    path: Location of a text file with one entry per line.

  Returns:
    list[str]: The file's lines, in file order.
  """
  # The context manager closes the handle even if reading raises.
  with open(path) as file:
    return [line.strip('\n') for line in file]

In [None]:
# Materialise NLTK's English stop-word list and the two sentiment lexicons.
stop_words = stopwords.words('english')
negative_words, positive_words = (
    get_wordlist(list_path)
    for list_path in (negative_wordlist_path, positive_wordlist_path)
)

# PreProcessing

In [None]:
lemmatizer = WordNetLemmatizer()
def refine_data(data, removestopw):
    """Normalise every question in ``data['question_text']``.

    Each question has every character that is neither a word character nor
    whitespace replaced by a space, is lower-cased and split on whitespace.
    Tokens that are not stop words are lemmatized with the module-level
    ``lemmatizer``.

    Args:
        data: Object whose ``'question_text'`` entry iterates over strings.
        removestopw: When equal to ``True`` stop words are dropped; otherwise
            they are kept, unlemmatized, in their original position.

    Returns:
        list[str]: One space-joined cleaned string per input question.
    """
    # Raw string: '\w' and '\s' are not valid escapes in a normal string literal.
    non_word = re.compile(r'[^\w\s]')
    # The module-level stop_words is a list; a set gives O(1) membership tests.
    stop_set = set(stop_words)
    corpus = []

    for question in data['question_text']:
        tokens = non_word.sub(' ', question).lower().split()
        if removestopw == True:
            tokens = [lemmatizer.lemmatize(tok) for tok in tokens if tok not in stop_set]
        else:
            # Stop words pass through untouched; everything else is lemmatized.
            tokens = [tok if tok in stop_set else lemmatizer.lemmatize(tok) for tok in tokens]

        corpus.append(' '.join(tokens))

    return corpus

In [None]:
def bestThresshold(y_train,train_preds):
    """Sweep decision thresholds and return the highest F1 score found.

    Thresholds come from ``np.arange(0.1, 0.501, 0.01)``. For each one the
    predictions are binarised with ``train_preds > threshold`` and scored
    against ``y_train`` with ``f1_score``. The threshold that gave the best
    score is printed; only the score itself is returned.

    Args:
        y_train: True binary labels.
        train_preds: Scores to binarise (converted with ``np.array``).

    Returns:
        The best F1 score, or ``0`` if no threshold scored above zero.
    """
    best_threshold = 0
    best_score = 0
    scores = np.array(train_preds)

    for threshold in tqdm(np.arange(0.1, 0.501, 0.01)):
        score = f1_score(y_train, scores > threshold)
        # Strict comparison keeps the earliest threshold on ties.
        if score > best_score:
            best_threshold, best_score = threshold, score

    print('\nMax occurs at :', best_threshold)
    return best_score

### Removing Contractions

In [None]:
# Count the characters of each question that appear in string.punctuation.
for frame in (traindata, testdata):
    frame['punctuations'] = frame['question_text'].apply(
        lambda text: sum(1 for ch in str(text) if ch in string.punctuation)
    )

Before expanding the contractions, we count the punctuation marks in each question and store the count as a meta feature, so that this information is not lost once the text is cleaned.

In [None]:
# Lookup table from a contraction to its expanded form, used by clean_contractions.
# Keys are written with a straight ASCII apostrophe and are matched as exact,
# case-sensitive tokens; only the "I..." contractions are listed in both cases.
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  
                       "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", 
                       "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", 
                       "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  
                       "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", 
                       "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not",
                       "mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", 
                       "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", 
                       "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", 
                       "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", 
                       "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", 
                       "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", 
                       "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", 
                       "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", 
                       "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", 
                       "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", 
                       "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", 
                       "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have",
                       "y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", 
                       "you're": "you are", "you've": "you have" }

In [None]:
def clean_contractions(text, mapping):
    """Expand contractions in ``text`` using ``mapping``.

    Apostrophe look-alikes (’ ‘ ´ `) are first normalised to a straight
    apostrophe. The text is then split on single spaces and each token is
    looked up in ``mapping``. An exact match wins; otherwise the lower-cased
    token is tried, so capitalised forms such as "What's" or "DON'T" are
    expanded as well, with the token's casing carried over to the expansion
    (all-caps -> upper-cased, leading capital -> first letter capitalised).

    Args:
        text: The string to process.
        mapping: Dict from contraction to expanded form.

    Returns:
        str: ``text`` with known contractions expanded; spacing is preserved.
    """
    for quote in ("’", "‘", "´", "`"):
        text = text.replace(quote, "'")

    def expand(token):
        if token in mapping:
            return mapping[token]
        # Fallback: the mapping mostly lists lowercase keys only.
        expansion = mapping.get(token.lower())
        if expansion is None:
            return token
        if token.isupper():
            return expansion.upper()
        if token[0].isupper():
            return expansion[:1].upper() + expansion[1:]
        return expansion

    return ' '.join(expand(token) for token in text.split(" "))

In [None]:
# Expand contractions in both splits with the shared mapping.
for frame in (traindata, testdata):
    frame['question_text'] = frame['question_text'].apply(
        lambda text: clean_contractions(text, contraction_mapping)
    )

### Creating Meta Features

In [None]:
def create_meta_features(data, negative_lexicon=None, positive_lexicon=None, stopword_list=None):
    """Add per-question meta-feature columns to ``data`` in place.

    Columns written (all derived from ``data['question_text']``):
      * ``negative_words`` / ``positive_words``: number of whitespace tokens
        (lower-cased) that are in the respective lexicon, either as-is or
        after stripping leading/trailing ``string.punctuation``. Whole tokens
        are matched, so "badge" does not count as "bad", and empty lexicon
        entries are ignored.
      * ``unique_words``: number of distinct whitespace tokens.
      * ``stop_words``: number of lower-cased tokens that are stop words.
      * ``uppercase_words``: number of tokens for which ``str.isupper()`` holds.
      * ``characters``: length of the question string.

    Args:
        data: DataFrame with a ``'question_text'`` column; modified in place.
        negative_lexicon: Optional iterable of negative words; defaults to the
            module-level ``negative_words``.
        positive_lexicon: Optional iterable of positive words; defaults to the
            module-level ``positive_words``.
        stopword_list: Optional iterable of stop words; defaults to the
            module-level ``stop_words``.

    Returns:
        None.
    """
    def vocabulary(entries):
        # An empty entry (e.g. from a blank line in the word list) is not a word.
        return {entry for entry in entries if entry}

    negative_set = vocabulary(negative_words if negative_lexicon is None else negative_lexicon)
    positive_set = vocabulary(positive_words if positive_lexicon is None else positive_lexicon)
    stop_set = set(stop_words if stopword_list is None else stopword_list)

    def lexicon_hits(text, lexicon):
        hits = 0
        for token in str(text).lower().split():
            # Try the raw token first so entries containing punctuation still match.
            if token in lexicon or token.strip(string.punctuation) in lexicon:
                hits += 1
        return hits

    questions = data['question_text']
    data['negative_words'] = questions.apply(lambda x: lexicon_hits(x, negative_set))
    data['positive_words'] = questions.apply(lambda x: lexicon_hits(x, positive_set))
    data['unique_words'] = questions.apply(lambda x: len(set(str(x).split())))
    data['stop_words'] = questions.apply(lambda x: len([w for w in str(x).lower().split() if w in stop_set]))
    data['uppercase_words'] = questions.apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
    data['characters'] = questions.apply(lambda x: len(str(x)))

In [None]:
# Add the meta features to each split, printing the frame shape before and after.
shape_reports = (
    (traindata,
     "Training data set shape before adding the meta features : ",
     "Training data set shape after adding the meta features : "),
    (testdata,
     "Testing data set shape before adding the meta features : ",
     "Testing data set shape after adding the meta features : "),
)
for frame, before_msg, after_msg in shape_reports:
    print(before_msg, frame.shape)
    create_meta_features(frame)
    print(after_msg, frame.shape)

Training data set shape before adding the meta features :  (783673, 3)
Training data set shape after adding the meta features :  (783673, 10)
Testing data set shape before adding the meta features :  (522449, 2)
Testing data set shape after adding the meta features :  (522449, 9)


In [None]:
# Persist the meta-feature columns of each split to its own CSV file.
meta_columns = ['negative_words', 'positive_words', 'unique_words', 'stop_words',
                'uppercase_words', 'punctuations', 'characters']
for frame, out_path in ((traindata, 'train_meta_features.csv'), (testdata, 'test_meta_features.csv')):
    frame[meta_columns].to_csv(out_path, index=False)

In [None]:
# Reload the persisted meta features for both splits.
train_meta, test_meta = (
    pd.read_csv(csv_name)
    for csv_name in ('train_meta_features.csv', 'test_meta_features.csv')
)

### Replacing a few words

In [None]:
# Word-level replacements used by replacewords: misspellings, British spellings
# and plural/inflected forms are mapped to a single canonical token.
# All keys are lowercase.
to_replace = {
    'banglore': 'bangalore',
    'linsurance': 'insurance',
    'neighbour': 'neighbor',
    'favour': 'favor',
    'tringle': 'triangle',
    'favourite': 'favorite',
    'labour': 'labor',
    'newhouse': 'new house',
    'bitcoins': 'bitcoin',
    'centre': 'center',
    'theatre': 'theater',
    'quorans': 'quoran',
    'quoran': 'quoran',  # NOTE(review): identity mapping — has no effect
    'origninal': 'original',
    'jewellery': 'jewelery',  # NOTE(review): target looks misspelled (US spelling is "jewelry") — confirm
    'gujaratis': 'gujarati',
    'fiendly': 'friendly',
    'organisation': 'organization',
    'behaviour': 'behavior',
    'iits': 'iit',
    'iims': 'iim',
    'iiits': 'iiit',
    'cryptocurrencies': 'cryptocurrency',
    'cancelled': 'canceled',
    'bengaluru': 'bangalore',
    'judgement': 'judgment',
    'infty': 'nifty',  # NOTE(review): unclear whether this rewrite is intended — confirm
    'fibre': 'fiber',
    'specialisation': 'specialization',
    'civilisation': 'civilization',
    'upvoting': 'upvote',
    'downvoting': 'downvote',
    'upvotes': 'upvote',
    'downvotes': 'downvote'
}

In [None]:
def replacewords(corpus, mapping=None):
    """Replace individual words of every sentence according to a lookup table.

    Each sentence is split on whitespace; every word is replaced by its
    mapped value when one exists and the words are re-joined with single
    spaces. An exact match is tried first, then the lower-cased word, so
    capitalised occurrences (e.g. "Bengaluru", "IITs") are normalised too
    even though the table's keys are lowercase.

    Args:
        corpus: Iterable of sentence strings.
        mapping: Optional dict of word -> replacement; defaults to the
            module-level ``to_replace``.

    Returns:
        list[str]: One rewritten sentence per input sentence.
    """
    if mapping is None:
        mapping = to_replace

    return [
        ' '.join(mapping.get(word, mapping.get(word.lower(), word)) for word in sent.split())
        for sent in corpus
    ]

In [None]:
# Apply the word-level replacements to both splits.
for frame in (traindata, testdata):
    frame['question_text'] = replacewords(frame['question_text'])

### Lemmatization and Removal of Stop Words.

In [None]:
# Clean, lemmatize and drop stop words (removestopw=True) in both splits.
for frame in (traindata, testdata):
    frame['question_text'] = refine_data(frame, True)

In [None]:
# Checkpoint the cleaned splits to disk.
for frame, out_path in ((traindata, 'train_temp.csv'), (testdata, 'test_temp.csv')):
    frame.to_csv(out_path, index=False)

### Checking for null data after removing stop words

In [None]:
# Reload the checkpointed splits from disk.
traindata, testdata = (pd.read_csv(csv_name) for csv_name in ('train_temp.csv', 'test_temp.csv'))

In [None]:
# Per-column count of missing values in the training split.
train_null_mask = traindata.isnull()
train_null_mask.sum()

qid               0
question_text    56
target            0
punctuations      0
dtype: int64

In [None]:
# Per-column count of missing values in the testing split.
test_null_mask = testdata.isnull()
test_null_mask.sum()

qid               0
question_text    32
punctuations      0
dtype: int64

In [None]:
# Replace missing question texts with a fixed placeholder sentence.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on a column selection is chained assignment, which does not update the
# parent frame when pandas Copy-on-Write is active.
placeholder_text = 'question number stop word punctuation'
traindata['question_text'] = traindata['question_text'].fillna(placeholder_text)
testdata['question_text'] = testdata['question_text'].fillna(placeholder_text)

In [None]:
# Persist the fully cleaned splits.
for frame, out_path in ((traindata, 'train_clean.csv'), (testdata, 'test_clean.csv')):
    frame.to_csv(out_path, index=False)

# Converting to Vectors and training.

## Using Word2Vec

In [None]:
# Start the Word2Vec experiment from the cleaned splits on disk.
traindata, testdata = (pd.read_csv(csv_name) for csv_name in ('train_clean.csv', 'test_clean.csv'))
for label, frame in (('Training data Shape :', traindata), ('Testing data Shape :', testdata)):
    print(label, frame.shape)

Training data Shape : (783673, 4)
Testing data Shape : (522449, 3)


In [None]:
# Load word vectors from a binary word2vec-format file located one directory
# above the working directory. get_embeddings below assumes 300-dimensional
# vectors — TODO confirm against the file.
word2vec_path = '../GoogleNews-vectors-negative300.bin'
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [None]:
def get_embeddings(data):
    """Build one 300-dimensional embedding per question by averaging word vectors.

    Each question in ``data['question_text']`` is tokenised with
    ``nltk.word_tokenize``. Tokens known to the module-level
    ``word2vec_model`` contribute their vector; unknown tokens contribute a
    zero vector, and the mean is taken over ALL tokens, so unknown words
    shrink the average. A question with no tokens gets an all-zero embedding.

    Args:
        data: Object whose ``'question_text'`` entry iterates over strings.

    Returns:
        tuple: ``(embeddings, unseen_words)`` where ``embeddings`` is an
        array of shape ``(n_questions, 300)`` and ``unseen_words`` is a list
        with the number of unknown tokens per question.
    """
    sentences = [nltk.word_tokenize(sent) for sent in data['question_text']]

    embeddings = np.zeros(shape=(len(sentences), 300))
    unseen_words = []

    for s_index, tokens in enumerate(sentences):
        word_vectors = np.zeros(shape=(len(tokens), 300))
        missing = 0
        for w_index, word in enumerate(tokens):
            # Membership on the KeyedVectors object itself works across gensim
            # versions; the `.vocab` attribute was removed in gensim 4.
            if word in word2vec_model:
                word_vectors[w_index] = word2vec_model.get_vector(word)
            else:
                missing += 1

        unseen_words.append(missing)
        # The mean of zero rows is NaN; keep the pre-filled zero row instead.
        if len(tokens) > 0:
            embeddings[s_index] = word_vectors.mean(axis=0)

    return (embeddings, unseen_words)

In [None]:
# Embed the training questions and append the 300 dimensions as columns d_0 .. d_299.
embeddings, unseen_words = get_embeddings(traindata)

embedding_columns = [f'd_{i}' for i in range(300)]
embeddings_df = pd.DataFrame(data=embeddings, columns=embedding_columns)
traindata = pd.concat(objs=[traindata, embeddings_df], axis=1)

In [None]:
#embeddings, unseen_words = get_embeddings(testdata)

#embeddings_df = pd.DataFrame(data=embeddings, columns=['d_' + str(i) for i in range(0, 300)])
#testdata = pd.concat([testdata, embeddings_df], axis=1)

In [None]:
# Drop the reference to the loaded vectors (presumably to release memory);
# get_embeddings cannot be called again until the model is reloaded.
del word2vec_model

In [None]:
# Keep only the embedding dimensions here. Identifier, text and label columns
# are dropped, and so is every meta-feature column already present on
# traindata, because the full meta-feature set is appended from train_meta in
# the following cell (otherwise 'punctuations' would appear twice in X).
non_feature_columns = ['question_text', 'qid', 'target']
repeated_meta_columns = [col for col in train_meta.columns if col in traindata.columns]
X = traindata.drop(columns=non_feature_columns + repeated_meta_columns)
y = traindata['target']

In [None]:
# Append the meta-feature columns to the feature frame.
# Re-running this cell appends them again.
y = traindata['target']
X = pd.concat(objs=[X, train_meta], axis=1)

In [None]:
# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=0,
    test_size=0.20,
)

In [None]:
# Standardise the features before fitting. On the raw features the solver
# stopped at max_iter without converging and scikit-learn's warning suggests
# scaling the data. The scaler is fitted on the training split only and then
# applied to the hold-out split.
feature_scaler = StandardScaler()
X_train_scaled = feature_scaler.fit_transform(X_train)
X_test_scaled = feature_scaler.transform(X_test)

logistic_classifier = LogisticRegression(max_iter=500, C=5)
logistic_classifier.fit(X_train_scaled, y_train)
# Probability of the positive class for every hold-out row.
pred_prob = logistic_classifier.predict_proba(X_test_scaled)[:, 1]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
# Best hold-out F1 over the threshold sweep.
best_f1 = bestThresshold(y_test, pred_prob)
print("F1 Score: %0.3f " % best_f1)

100%|██████████| 41/41 [00:02<00:00, 15.11it/s]


Max occurs at : 0.18999999999999995
F1 Score: 0.523 





## Using TFIDF

In [None]:
# Start the TF-IDF experiment from the cleaned splits on disk.
traindata, testdata = (pd.read_csv(csv_name) for csv_name in ('train_clean.csv', 'test_clean.csv'))
for label, frame in (('Training data Shape :', traindata), ('Testing data Shape :', testdata)):
    print(label, frame.shape)

Training data Shape : (783673, 4)
Testing data Shape : (522449, 3)


In [None]:
# Stack train and test questions (train first) under a fresh 0..n-1 index.
question_columns = [frame['question_text'] for frame in (traindata, testdata)]
corpus = pd.concat(objs=question_columns, ignore_index=True)

In [None]:
# Uni- and bi-gram TF-IDF vocabulary capped at 55,000 features,
# fitted on the combined train + test corpus.
tfidfv = TfidfVectorizer(
    max_features=55000,
    ngram_range=(1, 2),
)
tfidf_fit = tfidfv.fit(corpus)

In [None]:
# Meta-feature columns appended to the TF-IDF matrix in this experiment.
meta_features = [
    'negative_words',
    'positive_words',
    'unique_words',
    'stop_words',
    'uppercase_words',
    'punctuations',
    'characters',
]

In [None]:
def get_sparse_matrix(meta_data, data, model):
    """Vectorise ``data`` and append ``meta_data`` as extra columns.

    Args:
        meta_data: Array-like of meta features, one row per document; it is
            converted with ``csr_matrix``.
        data: Documents handed to ``model.transform``.
        model: Fitted vectoriser exposing ``transform``.

    Returns:
        CSR sparse matrix whose leading columns are ``model.transform(data)``
        and whose trailing columns are the meta features.
    """
    text_block = model.transform(data)
    meta_block = csr_matrix(meta_data)
    return hstack((text_block, meta_block), format='csr')

In [None]:
# The first n_train corpus rows are the training questions, the rest the test questions.
n_train = traindata.shape[0]
y = traindata['target']
X = get_sparse_matrix(train_meta[meta_features], corpus[:n_train], tfidf_fit)
testdata_features = get_sparse_matrix(test_meta[meta_features], corpus[n_train:], tfidf_fit)

In [None]:
# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=traindata['target'],
    random_state=1024,
    test_size=0.20,
)

In [None]:
def model_train_cv(Xtrain, ytrain, Xtest, folds=5, maxiter=500, random_state=1024):
    """Average positive-class probabilities of per-fold logistic regressions.

    ``Xtrain`` is split with a shuffled ``StratifiedKFold``. For every fold a
    ``LogisticRegression(C=4, solver='liblinear')`` is fitted on that fold's
    training part and used to score ``Xtest``; the per-fold probabilities are
    averaged. The validation part of each fold is not used.

    Args:
        Xtrain: Training feature matrix supporting row indexing by an index array.
        ytrain: Training labels supporting ``.iloc`` indexing.
        Xtest: Feature matrix to score.
        folds: Number of stratified folds.
        maxiter: ``max_iter`` passed to each classifier.
        random_state: Seed for the fold shuffling.

    Returns:
        1-D array with one averaged positive-class probability per row of ``Xtest``.
    """
    # Honour the caller's seed; the default matches the previously fixed 1024.
    splitter = StratifiedKFold(n_splits=folds, shuffle=True, random_state=random_state)

    test_prob = np.zeros(Xtest.shape[0])
    mul = 1 / folds

    for train_idx, _ in splitter.split(Xtrain, ytrain):
        clf = LogisticRegression(C=4, solver='liblinear', max_iter=maxiter)
        clf.fit(Xtrain[train_idx], ytrain.iloc[train_idx])
        # Each fold contributes an equal share to the averaged probability.
        test_prob += (mul * clf.predict_proba(Xtest)[:, 1])

    return test_prob

In [None]:
# Score the hold-out split with the 5-fold averaged model (max_iter=800).
test_predictions = model_train_cv(X_train, y_train, X_test, 5, 800)
best_f1 = bestThresshold(y_test, test_predictions)
print("F1 Score: %0.3f " % best_f1)

100%|██████████| 41/41 [00:02<00:00, 15.37it/s]


Max occurs at : 0.24999999999999992
F1 Score: 0.601 





In [None]:
multinomialnb_classifier = MultinomialNB()
multinomialnb_classifier.fit(X_train, y_train)
# The threshold sweep needs scores, not hard labels: on 0/1 predictions every
# threshold between 0.1 and 0.5 gives the same split, so use the
# positive-class probability instead of predict().
pred = multinomialnb_classifier.predict_proba(X_test)[:, 1]
print ("F1 Score: %0.3f " % bestThresshold(y_test, pred))

100%|██████████| 41/41 [00:02<00:00, 15.82it/s]


Max occurs at : 0.1
F1 Score: 0.444 





# Best Submission Code

In [None]:
# Start the final pipeline again from the raw splits.
traindata, testdata = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Expand contractions in both splits with the shared mapping.
for frame in (traindata, testdata):
    frame['question_text'] = frame['question_text'].apply(
        lambda text: clean_contractions(text, contraction_mapping)
    )

In [None]:
# Stack train and test questions (train first) under a fresh 0..n-1 index.
question_columns = [frame['question_text'] for frame in (traindata, testdata)]
data = pd.concat(objs=question_columns, ignore_index=True)

In [None]:
# Delete every character that is neither a word character nor whitespace, then
# lower-case. The pattern is a raw string because '\w' and '\s' are not valid
# escapes in a normal string literal. The comprehension also avoids rebinding
# the name `words`, which is imported from nltk.corpus.
punctuation_pattern = re.compile(r'[^\w\s]')
corpus = [punctuation_pattern.sub('', question).lower() for question in data]

print(len(corpus))

1306122


In [None]:
# Character 1- to 4-gram TF-IDF with sublinear term frequency and accent
# stripping, capped at 40,000 features and fitted on the combined corpus.
tfidfv = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 4),
    max_features=40000,
    sublinear_tf=True,
    strip_accents='unicode',
)
tfidf_fit = tfidfv.fit(corpus)

#X = tfidf_fit.transform(corpus[0: traindata.shape[0]])
#y = traindata['target']
#testdata_features = tfidf_fit.transform(corpus[traindata.shape[0]: ])

In [None]:
# Meta-feature columns used by the final model.
meta_features = [
    'negative_words',
    'positive_words',
    'unique_words',
    'uppercase_words',
    'punctuations',
]

In [None]:
# The first n_train corpus entries are the training questions, the rest the test questions.
n_train = traindata.shape[0]
y = traindata['target']
X = get_sparse_matrix(train_meta[meta_features], corpus[:n_train], tfidf_fit)
testdata_features = get_sparse_matrix(test_meta[meta_features], corpus[n_train:], tfidf_fit)

In [None]:
# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=traindata['target'],
    random_state=1024,
    test_size=0.20,
)

In [None]:
# Score the hold-out split with the 5-fold averaged model (max_iter=800).
test_predictions = model_train_cv(X_train, y_train, X_test, 5, 800)
best_f1 = bestThresshold(y_test, test_predictions)
print("F1 Score: %0.3f " % best_f1)

100%|██████████| 41/41 [00:02<00:00, 15.55it/s]


Max occurs at : 0.2699999999999999
F1 Score: 0.625 





### Submission file Creation

In [None]:
# Train on the full training set and score the real test set.
test_predictions = model_train_cv(X, y, testdata_features, 5, 800)

# Binarise at a fixed 0.25 cut-off and write one (qid, target) row per test question.
submissiondf = pd.DataFrame(data=testdata['qid'])
pred = (test_predictions > 0.25).astype(np.int8)
submissiondf['target'] = pred
submissiondf.to_csv('Submission.csv', index=False)

print('Submisssion File Shape : ', submissiondf.shape)

Submisssion File Shape :  (522449, 2)
