In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
import re
import html.parser

# Module-level HTMLParser instance.
# NOTE: HTMLParser.unescape() was removed in Python 3.9; html.unescape() is the
# supported way to decode HTML character references.
html_parser = html.parser.HTMLParser()
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
# pip3 install beautifulsoup4

In [None]:
# Load the training split of the Stack Overflow question-quality dataset from the Kaggle input directory.
train_set = pd.read_csv('/kaggle/input/60k-stack-overflow-questions-with-quality-rate/train.csv')
# Preview the first rows to check which columns are available.
train_set.head()

In [None]:
# (rows, columns) of the training frame.
train_set.shape

# **Checking for data imbalance**

In [None]:
# Number of rows per value of the label column 'Y'; similar counts mean the classes are balanced.
train_set['Y'].value_counts()

# **Data Cleaning**
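1. Decoding HTML character references (e.g. `&amp;` → `&`).
2. Lower-casing the text and removing punctuation (every character that is neither a word character nor whitespace).
3. Removing stop words.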

In [None]:
def data_cleaning(text, stop_words=None):
    """Normalise a raw question string before TF-IDF vectorisation.

    Steps, in order:
      1. Decode HTML character references (e.g. ``&amp;`` becomes ``&``).
      2. Lower-case the text and delete every character that is neither a
         word character (letter, digit or underscore) nor whitespace.
      3. Drop stop words and collapse whitespace runs to single spaces.

    HTML tags are not parsed: only their punctuation is removed, so ``<b>``
    leaves the token ``b`` behind.

    Args:
        text: The raw string to clean.
        stop_words: Optional container of lower-case words to remove.
            When omitted, the ``STOP_WORDS`` set imported at the top of the
            notebook is used.

    Returns:
        The cleaned, space-separated string.
    """
    # Local import keeps the cell self-contained. html.unescape is the
    # supported replacement for HTMLParser.unescape, removed in Python 3.9.
    import html

    # Only look up the notebook-level default when the caller gave no override.
    words_to_drop = STOP_WORDS if stop_words is None else stop_words

    text = html.unescape(text)
    # Raw string so that \w and \s reach the regex engine without an
    # invalid-escape warning from the Python parser.
    text = re.sub(r'[^\w\s]', "", text.lower())
    return " ".join(word for word in text.split() if word not in words_to_drop)

In [None]:
# One document per question: title and body joined by a single space.
train_set['text'] = train_set['Title'].str.cat(train_set['Body'], sep=' ')
# Run every document through the cleaning function.
train_set['text_cleaned'] = train_set['text'].map(data_cleaning)
# Show cleaned text beside the raw text and the label for a quick visual check.
train_set.loc[:, ['text_cleaned', 'text', 'Y']].head()

# **Feature Extraction and Engineering**
Limiting features to 5000 to reduce the model complexity and to improve the model interpretability.

In [None]:
# TF-IDF vectoriser capped at 5000 vocabulary terms.
vect = TfidfVectorizer(max_features=5000)
# Learn the vocabulary and IDF weights from the cleaned training text and
# return the training document-term matrix in one step.
features = vect.fit_transform(train_set['text_cleaned'])
# (documents, terms) of the resulting matrix.
features.shape

In [None]:
# Map the labels in 'Y' to integer class ids.
# LabelEncoder assigns ids in sorted order of the label values, so
# encoder.classes_[i] is the original label for id i.
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(train_set['Y'])
encoded_labels

# **Model Building**
An SVM is used because the text is represented by a large number of features (a high-dimensional vector space), and SVMs are effective at extracting information from such vectors. A linear kernel is used to keep training fast, and text data is often linearly separable.

In [None]:
# Train a linear SVM on the TF-IDF features.
# random_state pins the solver's pseudo-random number generator so that
# "Restart & Run All" reproduces the same fitted model.
model = LinearSVC(random_state=42)
model.fit(features, encoded_labels)
# Mean accuracy on the *training* data: an optimistic figure, not a measure
# of how well the model generalises.
model.score(features, encoded_labels)

# **Model Testing**

In [None]:
# Load the validation split of the same dataset from the Kaggle input directory.
valid_set = pd.read_csv('/kaggle/input/60k-stack-overflow-questions-with-quality-rate/valid.csv')
# Preview the first rows.
valid_set.head()

In [None]:
# One document per question: title and body joined by a single space.
valid_set['text'] = valid_set['Title'].str.cat(valid_set['Body'], sep=' ')
# Apply the same cleaning function that was used on the training text.
valid_set['text_cleaned'] = valid_set['text'].map(data_cleaning)
# Show cleaned text beside the raw text and the label for a quick visual check.
valid_set.loc[:, ['text_cleaned', 'text', 'Y']].head()

In [None]:
# Vectorise the validation text with the already-fitted vectoriser
# (transform only, no refit).
valid_features = vect.transform(valid_set['text_cleaned'])
pred = model.predict(valid_features)
encoded_labels_valid = encoder.transform(valid_set['Y'])
# Report per-class metrics under the original label names instead of the
# opaque encoded ids. Passing `labels` explicitly keeps `target_names`
# aligned with the class ids even if a class is absent from this split.
class_ids = encoder.transform(encoder.classes_)
print(classification_report(encoded_labels_valid, pred, labels=class_ids, target_names=encoder.classes_))

In [None]:
# Tick labels are applied positionally to the encoded class ids (0, 1, 2, ...),
# so they must follow encoder.classes_, which LabelEncoder stores in sorted
# order. A hand-written list such as ['LQ_CLOSE', 'HQ', 'LQ_EDIT'] does not
# match that order and mislabels the axes.
plot_confusion_matrix(model, valid_features, encoded_labels_valid, display_labels=encoder.classes_, cmap=plt.cm.Blues)

In [None]:
def f_importances(coef, names, top_n, label):
    """Plot a horizontal bar chart of the largest coefficients for one class.

    Args:
        coef: Per-feature coefficients for a single class.
        names: Feature names, in the same order as ``coef``.
        top_n: Maximum number of features to show. Fewer bars are drawn when
            fewer features are available.
        label: Class label shown in the chart title.
    """
    # Rank (coefficient, name) pairs from largest to smallest and keep the
    # leading ones. Slicing the list copes with top_n exceeding the number of
    # features and with empty input.
    ranked = sorted(zip(coef, names), reverse=True)[:max(top_n, 0)]
    weights = [weight for weight, _ in ranked]
    shown_names = [name for _, name in ranked]

    positions = range(len(ranked))
    plt.barh(positions, weights, align='center')
    plt.yticks(positions, shown_names)
    # The title reports the number of bars actually drawn, not a fixed 10.
    plt.title(f"Top {len(ranked)} features for class {label}")
    plt.show()
    

# get_feature_names() was deprecated in scikit-learn 1.0 in favour of
# get_feature_names_out(); use whichever the installed version provides.
if hasattr(vect, "get_feature_names_out"):
    features_names = vect.get_feature_names_out()
else:
    features_names = vect.get_feature_names()
class_name = 2  # NOTE(review): not read by any visible cell — confirm before removing
# model.coef_[i] holds the weights for encoded class i, whose original label is
# encoder.classes_[i]. Looping over encoder.classes_ keeps every chart title
# matched to the coefficients it shows.
for class_index, class_label in enumerate(encoder.classes_):
    f_importances(model.coef_[class_index], features_names, top_n=10, label=class_label)
# model.coef_[0].shape
# vect.get_feature_names
# model.coef_[class_name], features_names