# Import all libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt

import re
from fuzzywuzzy import fuzz
from tqdm import tqdm

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem.snowball import SnowballStemmer

import xgboost as xgb

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [None]:
pd.set_option("display.max_rows", None) 
  

# Preprocessing data

In [None]:
data = pd.read_csv("/kaggle/input/60k-stack-overflow-questions-with-quality-rate/train.csv")
data.head()

# **Exploratory Data analysis**

In [None]:
def replace_punct(text):
    """Convert a raw tag string such as ``<a><b>`` into ``a,b``.

    Adjacent ``><`` pairs are turned into commas first; every angle
    bracket that is still present afterwards is removed.
    """
    return text.replace("><", ",").replace("<", "").replace(">", "")

data['Tags_cleaned'] = data['Tags'].apply(replace_punct)
    
    

In [None]:
data.head()

## Analysis Of Tags

Total number of unique tags

In [None]:
vector = CountVectorizer(tokenizer=lambda x:x.split(","))
tag_trans = vector.fit_transform(data['Tags_cleaned'])

In [None]:
print("Number of tags are {}".format(tag_trans.shape[1]))

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when it exists and fall back on older releases.
# .tolist() keeps `tags` a plain list of Python strings, as before.
if hasattr(vector, "get_feature_names_out"):
    tags = vector.get_feature_names_out().tolist()
else:
    tags = vector.get_feature_names()
print("some of tags are {}".format(tags[40:100]))

**Number of times tag appears**

In [None]:
# Column sums of the count matrix, flattened to a 1-D array: one total per feature.
freq = tag_trans.sum(axis=0).A1
# Pair every feature name in `tags` with its total.
freq_dict=dict(zip(tags,freq))

In [None]:
freq_dict['.net']

In [None]:
tag_freq_df = pd.DataFrame.from_dict(freq_dict, orient='index', columns=['Count']).reset_index(drop=False)
tag_df_sorted = tag_freq_df.sort_values(['Count'], ascending=False).reset_index(drop=True)
tag_df_sorted.head()

In [None]:
tag_df_sorted.tail(10)

In [None]:
tag_counts = tag_df_sorted['Count'].values
plt.plot(tag_counts[0:150])
plt.title("Distribution of frequency of Tags Appeared")
plt.grid()
plt.ylabel("Number of times tag appeared")
plt.xlabel("Tag Number")
plt.show()

In [None]:
tags_final = tag_df_sorted[tag_df_sorted.Count>3]

In [None]:
tags_final.shape

In [None]:
final_tags = list(tags_final['index'].values)

In [None]:
final_tags[:10]

In [None]:
def tag_remove(text):
    """Keep only the tags of a comma-separated string that are in ``final_tags``.

    Parameters
    ----------
    text : str
        Comma-separated tag string.

    Returns
    -------
    str
        The retained tags, comma-separated, in their original order and
        without duplicates. An empty string when no tag is retained.
    """
    allowed = set(final_tags)
    # dict.fromkeys de-duplicates while preserving order, so the output is
    # reproducible (a plain set intersection has an arbitrary order).
    kept = [tag for tag in dict.fromkeys(text.split(",")) if tag in allowed]
    return ",".join(kept)

In [None]:
data['Tags_final'] = data['Tags_cleaned'].apply(tag_remove)

In [None]:
data['Tags_cleaned'].apply(lambda x:len(x.split(","))).equals(data['Tags_final'].apply(lambda x:len(x.split(","))))

In [None]:
data = data.drop(['Tags', 'CreationDate','Tags_cleaned'], axis=1)
data['Y'] = data['Y'].map({'LQ_CLOSE':0, 'LQ_EDIT': 1, 'HQ':2})
data.head()


In [None]:
data.loc[10,'Body']

In [None]:
def striphtml(data):
    """Replace every ``<...>`` tag in *data* with a single space.

    The argument is converted with ``str()`` first, so non-string values
    are accepted as well.
    """
    return re.sub('<.*?>', ' ', str(data))

stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer("english")

In [None]:
# Collect the contents of every <code>...</code> span of the body as a list of strings;
# re.DOTALL lets a single span extend across line breaks.
data['code'] = data['Body'].apply(lambda x: re.findall(r'<code>(.*?)</code>', x, flags=re.DOTALL))

In [None]:
data.head(11)

In [None]:
data.loc[3, 'Body']

In [None]:
# Remove every <code>...</code> span (tags and contents) from the body ...
data['question'] = data['Body'].apply(lambda x:re.sub('<code>(.*?)</code>', '', x, flags=re.MULTILINE|re.DOTALL))
# ... then replace the remaining HTML tags with spaces.
data['question'] = data['question'].apply(lambda x: striphtml(x))

In [None]:
data.loc[10,'code'][1]

In [None]:
def clean_text(text):
    """Lower-case *text* and keep only ASCII letters and whitespace.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The lower-cased text with every character other than ``a-z`` and
        whitespace removed.
    """
    text = text.lower()
    # Parentheses inside a character class are literal characters, so the
    # former pattern `[^(a-zA-Z)\s]` let "(" and ")" through. After lower()
    # only a-z can remain as letters.
    return re.sub(r'[^a-z\s]', '', text)

In [None]:
#data['Title'] = data['Title'].apply(lambda x:re.findall(r'b(.*?)', x, flags=re.DOTALL))
#data['Title'] = data['Title'].apply(lambda x:x.encode('utf-8'))

In [None]:
# Join title and body text with a space so that the last title word and the
# first body word cannot fuse into a single token, then normalise the result.
data['question'] = data['Title'].astype(str) + " " + data['question'].astype(str)
data['question'] = data['question'].apply(clean_text)


In [None]:
data.loc[31,'Body']

In [None]:
data.loc[31,'question']

In [None]:
data.loc[31,'Title']

In [None]:
stop_words = set(stopwords.words('english')) 

def remove_stopword(words):
    """Drop the entries of ``stop_words`` from a space-separated string.

    The text is split on single spaces only; each piece is compared with
    ``stop_words`` exactly as it appears between two spaces.
    """
    kept = (token for token in words.split(' ') if token not in stop_words)
    return ' '.join(kept)

def remove_next_line(words):
    """Return *words* with every newline character replaced by a space."""
    return words.replace('\n', ' ')

def remove_r_char(words):
    """Return *words* with every carriage-return character deleted."""
    return words.replace('\r', '')

In [None]:
# Normalise line breaks first: remove_stopword splits on single spaces only, so
# a stop word that is glued to "\n" or "\r" would otherwise never be matched.
data['question'] = data['question'].apply(remove_next_line)
data['question'] = data['question'].apply(remove_r_char)
data['question'] = data['question'].apply(remove_stopword)

In [None]:
distribution = data.groupby('Y')['Body'].count().reset_index()

In [None]:
distribution

# Making basic Features


In [None]:
data['Num_words_body'] = data['Body'].apply(lambda x:len(str(x).split())) # Number of whitespace-separated tokens in the body
data['Num_words_title'] = data['Title'].apply(lambda x:len(str(x).split())) # Number of whitespace-separated tokens in the title
data['difference_in_words'] = abs(data['Num_words_body'] - data['Num_words_title']) # Absolute difference between the body and title token counts

In [None]:
# NOTE: despite the column names, these are counts of *distinct* characters
# (spaces excluded): the text is reduced to a set before its length is taken.
data['Num_char_body'] = data['Body'].apply(lambda x:len("".join(set(str(x).replace(" ",""))))) 
data['Num_char_title'] = data['Title'].apply(lambda x:len("".join(set(str(x).replace(" ","")))))

In [None]:
data['len_common_words'] = data.apply(lambda x:len(set(str(x['Title']).split()).intersection(set(str(x['Body']).split()))),axis=1)

In [None]:
data.head(3)

# **Make Fuzzy features**

In [None]:
data['fuzz_qratio'] = data.apply(lambda x:fuzz.QRatio(str(x['Title']),str(x['Body'])), axis=1)
data['fuzz_Wratio'] = data.apply(lambda x:fuzz.WRatio(str(x['Title']),str(x['Body'])), axis=1)
data['fuzz_partial_ratio'] = data.apply(lambda x:fuzz.partial_ratio(str(x['Title']),str(x['Body'])), axis=1)
data['fuzz_partial_token_set_ratio'] = data.apply(lambda x:fuzz.partial_token_set_ratio(str(x['Title']),str(x['Body'])), axis=1)
data['fuzz_partial_token_sort_ratio'] = data.apply(lambda x:fuzz.partial_token_sort_ratio(str(x['Title']),str(x['Body'])), axis=1)
data['fuzz_token_set_ratio'] = data.apply(lambda x:fuzz.token_set_ratio(str(x['Title']),str(x['Body'])), axis=1)
data['fuzz_token_sort_ratio'] = data.apply(lambda x:fuzz.token_sort_ratio(str(x['Title']),str(x['Body'])), axis=1)

In [None]:
data.head(3)

# Split train test data

In [None]:
data['Body_with_title'] = data['Title'] + " " + data['Body']

In [None]:
# Stratified, shuffled 80/20 split with a fixed seed. The features are passed
# as a plain array (`.values`), so columns are addressed by position from here
# on; 'Body_with_title' was added last and is therefore the final column.
xtrain, xvalid, ytrain, yvalid = train_test_split(data.drop(['Id','Title','Body','Y'],axis=1).values, data['Y'].values, 
                                                  stratify=data['Y'].values, 
                                                  random_state=42, 
                                                  test_size=0.2, shuffle=True)

In [None]:
len(ytrain)

In [None]:
len(yvalid)

In [None]:
yvalid

In [None]:
def get_accuracy(clf, predictions, yvalid):
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    clf : object
        Unused; kept so that existing three-argument calls keep working.
    predictions : array-like
        Predicted class labels.
    yvalid : array-like
        True class labels, same shape as ``predictions``.

    Returns
    -------
    float
        Mean of the element-wise equality between the two inputs.

    Raises
    ------
    ValueError
        If the two inputs differ in shape, e.g. when a probability matrix
        is passed instead of class labels.
    """
    predicted = np.asarray(predictions)
    actual = np.asarray(yvalid)
    # Without this check a shape mismatch either broadcasts or collapses to a
    # single False, both of which yield a silently wrong accuracy.
    if predicted.shape != actual.shape:
        raise ValueError(
            "predictions has shape {} but yvalid has shape {}; "
            "pass class labels, not probabilities".format(predicted.shape, actual.shape)
        )
    return np.mean(predicted == actual)

# Make Tf-idf features

In [None]:
# use_idf / smooth_idf / sublinear_tf are boolean options. Recent scikit-learn
# releases validate parameter types and reject the integer 1, so pass True.
tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=True, smooth_idf=True, sublinear_tf=True,
            stop_words = 'english')

In [None]:
tfv.fit(list(xtrain[:,-1]))
xtrain_tfv =  tfv.transform(xtrain[:,-1]) 
xvalid_tfv = tfv.transform(xvalid[:,-1])

# Count vectorizer Model for Comparison with TF-IDF

In [None]:
# Word-level counts of 1- to 3-grams with English stop words removed.
ctv = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), stop_words = 'english')

# Fit the count vectorizer on the training split only, then transform both
# splits with the fitted vocabulary.
ctv.fit(xtrain[:,-1])
xtrain_ctv =  ctv.transform(xtrain[:,-1]) 
xvalid_ctv = ctv.transform(xvalid[:,-1])


# Fit a simple Logistic regression Model on tf-idf

In [None]:
# Fitting a simple Logistic Regression on TFIDF
clf = LogisticRegression()
clf.fit(xtrain_tfv, ytrain)
# Use hard class labels: multilabel_confusion_matrix and get_accuracy compare
# against yvalid element-wise, which the (n_samples, n_classes) matrix
# returned by predict_proba cannot satisfy.
predictions = clf.predict(xvalid_tfv)

In [None]:
# Logistic regression on the count-vectorizer features.
clf_ctv = LogisticRegression()
clf_ctv.fit(xtrain_ctv, ytrain)
# Use hard class labels: multilabel_confusion_matrix and get_accuracy compare
# against yvalid element-wise, which the (n_samples, n_classes) matrix
# returned by predict_proba cannot satisfy.
predictions_ctv = clf_ctv.predict(xvalid_ctv)

In [None]:
predictions

In [None]:
from sklearn.metrics import multilabel_confusion_matrix
multilabel_confusion_matrix(yvalid, predictions)


In [None]:
multilabel_confusion_matrix(yvalid, predictions_ctv)

In [None]:
get_accuracy(clf, predictions, yvalid)

In [None]:
get_accuracy(clf_ctv, predictions_ctv, yvalid)

# Fit an Xgboost on tf-idf features

In [None]:
clf = xgb.XGBClassifier(max_depth=10, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1, silent=False)
clf.fit(xtrain_tfv, ytrain)
predictions = clf.predict(xvalid_tfv)

In [None]:
get_accuracy(clf, predictions, yvalid)

In [None]:
# Per-class confusion matrices for the current `predictions`
# (the closing parenthesis was missing, which is a SyntaxError).
multilabel_confusion_matrix(yvalid, predictions)

In [None]:
clf_ctv = xgb.XGBClassifier(max_depth=10, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)
clf_ctv.fit(xtrain_ctv, ytrain)
predictions_ctv = clf_ctv.predict(xvalid_ctv)

In [None]:
get_accuracy(clf_ctv, predictions_ctv, yvalid)

# Fit a Naive bayes Model on tf-idf only

In [None]:
clf = MultinomialNB()
clf.fit(xtrain_tfv, ytrain)
predictions = clf.predict(xvalid_tfv)


In [None]:
get_accuracy(clf, predictions, yvalid)

In [None]:
clf_ctv = MultinomialNB()
clf_ctv.fit(xtrain_ctv, ytrain)
predictions_ctv = clf_ctv.predict(xvalid_ctv)

In [None]:
get_accuracy(clf_ctv, predictions_ctv, yvalid)

# Fit an SVM on SVD-reduced tf-idf features

In [None]:
# Apply SVD with 180 components. 120-200 components are good enough for SVM model.
svd = decomposition.TruncatedSVD(n_components=180)
svd.fit(xtrain_tfv)
xtrain_svd = svd.transform(xtrain_tfv)
xvalid_svd = svd.transform(xvalid_tfv)

# Scale the data obtained from SVD. The scaled copies get their own names so
# the unscaled SVD output stays available for reuse.
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xvalid_svd)

In [None]:
# NOTE(review): the earlier remark "since we need probabilities" does not match
# the code: probability=True is not passed, so only hard predictions are available.
clf = SVC(C=1.0)
clf.fit(xtrain_svd_scl, ytrain)

In [None]:
predictions = clf.predict(xvalid_svd_scl)

In [None]:
get_accuracy(clf, predictions, yvalid)

# Fitting XGBoost on tf-idf SVD features

In [None]:
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)
clf.fit(xtrain_svd, ytrain)
predictions = clf.predict(xvalid_svd)

In [None]:
get_accuracy(clf, predictions, yvalid)

# Grid Search

In [None]:
# make_scorer calls its metric as score_func(y_true, y_pred). get_accuracy
# takes three positional arguments (clf, predictions, yvalid) and would raise
# a TypeError here, so use the equivalent two-argument accuracy_score.
# `needs_proba=False` was the default and is dropped because newer
# scikit-learn releases no longer accept that keyword.
mll_scorer = metrics.make_scorer(metrics.accuracy_score, greater_is_better=True)

In [None]:
# Dimensionality reduction step; n_components is left at its default here.
svd = TruncatedSVD()
    
# Initialize the standard scaler 
scl = preprocessing.StandardScaler()

# The final estimator is an XGBoost classifier with default settings.
xg_model = xgb.XGBClassifier()

# Create the pipeline: SVD -> scaling -> XGBoost. The step names are the
# prefixes used for parameter keys such as 'svd__n_components'.
clf = pipeline.Pipeline([('svd', svd),
                         ('scl', scl),
                         ('xg', xg_model)])

In [None]:
param_grid = {'svd__n_components' : [120, 150, 180],
              'xg__max_depth':[5,7,10],
              'xg__learning_rate':[0.1,0.01,0.5]}


# 2-fold grid search over the pipeline, refit on the best setting.
# The `iid` argument was deprecated in scikit-learn 0.22 and removed in 0.24,
# where passing it raises a TypeError, so it is no longer supplied.
model = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=mll_scorer,
                                 verbose=10, n_jobs=-1, refit=True, cv=2)

model.fit(xtrain_tfv, ytrain)  # we can use the full data here but im only using xtrain
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

# Make word-vector features (still improving)

In [None]:
def read_glove_vecs(glove_file):
    """Load a GloVe-style text file into a word -> vector dictionary.

    Every non-empty line is expected to hold a word followed by its
    whitespace-separated vector components.

    Parameters
    ----------
    glove_file : str
        Path of the embeddings text file.

    Returns
    -------
    dict
        Maps each word to a 1-D ``np.float64`` array of its components.
    """
    word_to_vec_map = {}
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding, which can fail on non-ASCII tokens.
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split()
            if not parts:
                # Blank line: nothing to parse (indexing parts[0] would raise).
                continue
            word_to_vec_map[parts[0]] = np.array(parts[1:], dtype=np.float64)
    return word_to_vec_map
#word_to_vec_map = read_glove_vecs('../input/glove-global-vectors-for-word-representation/glove.6B.100d.txt')
word_to_vec_map = read_glove_vecs('../input/glovetwitter27b100dtxt/glove.twitter.27B.200d.txt')

In [None]:
def prepare_sequence(ds, word_to_vec_map):
    """Embed every sentence as the mean of its word vectors.

    Parameters
    ----------
    ds : iterable of str
        Sentences; each one is split on whitespace.
    word_to_vec_map : dict
        Maps a word to its 1-D vector. Words missing from the map use the
        vector stored under ``'#'`` when that key exists, otherwise zeros.

    Returns
    -------
    np.ndarray
        One row per sentence. A sentence without any token yields a row of
        zeros.

    Raises
    ------
    ValueError
        If ``word_to_vec_map`` is empty, since the vector size is then unknown.
    """
    if not word_to_vec_map:
        raise ValueError("word_to_vec_map is empty; cannot infer the vector size")

    # Take the vector shape from any entry instead of requiring one
    # particular word ('cucumber') to be part of the vocabulary.
    vec_shape = next(iter(word_to_vec_map.values())).shape
    oov_vector = word_to_vec_map.get('#')
    if oov_vector is None:
        oov_vector = np.zeros(vec_shape)

    traintest_X = []
    for sentence in tqdm(ds):
        tokens = sentence.split()
        sequence_words = np.zeros(vec_shape)
        for word in tokens:
            sequence_words += word_to_vec_map.get(word, oov_vector)
        if tokens:
            # Average over the number of words; the previous code divided by
            # len(sentence), i.e. the number of characters.
            sequence_words /= len(tokens)
        traintest_X.append(sequence_words)
    return np.array(traintest_X)


In [None]:
# Wrap the single document in a list: prepare_sequence iterates over its first
# argument, so a bare string would be processed one character at a time.
prepare_sequence([xtrain[:,-1][0]], word_to_vec_map)

In [None]:
#concatenate all sequences for training and testing set
train_w2v = prepare_sequence(xtrain[:,-1], word_to_vec_map)
valid_w2v = prepare_sequence(xvalid[:,-1], word_to_vec_map)

In [None]:
clf = LogisticRegression()
clf.fit(train_w2v, ytrain)
predictions = clf.predict(valid_w2v)

In [None]:
get_accuracy(clf, predictions, yvalid)

In [None]:
clf = xgb.XGBClassifier(max_depth=15, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1, silent=False)
clf.fit(train_w2v, ytrain)
predictions = clf.predict(valid_w2v)


In [None]:
get_accuracy(clf, predictions, yvalid)

In [None]:
clf2 = xgb.XGBClassifier(max_depth=10, n_estimators=150, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1, silent=False)
clf2.fit(train_w2v, ytrain)


In [None]:
predictions = clf2.predict(valid_w2v)

In [None]:
get_accuracy(clf2, predictions, yvalid)

In [None]:
# xtrain/xvalid are object arrays that also hold text and list columns
# ('Tags_final', 'code', 'question', 'Body_with_title'); feeding those to a
# model fails. Keep only the numeric feature columns, located by position.
# Assumes `data` still has the same columns as when the split was made.
feature_cols = data.columns.drop(['Id', 'Title', 'Body', 'Y'])
numeric_idx = [pos for pos, col in enumerate(feature_cols)
               if pd.api.types.is_numeric_dtype(data[col])]
final_xtrain = np.concatenate((xtrain[:, numeric_idx].astype(np.float64), train_w2v), axis=1)
final_xvalid = np.concatenate((xvalid[:, numeric_idx].astype(np.float64), valid_w2v), axis=1)

In [None]:
clf = LogisticRegression()
clf.fit(final_xtrain, ytrain)
predictions = clf.predict(final_xvalid)

In [None]:
get_accuracy(clf, predictions, yvalid)

In [None]:
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1, silent=False)
clf.fit(final_xtrain, ytrain)
predictions = clf.predict(final_xvalid)

In [None]:
get_accuracy(clf, predictions, yvalid)