In [None]:
# Import necessary libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc

In [None]:
# Walk the Kaggle input directory and print the full path of every file found
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

In [None]:
# Load the question dataset from its CSV file into a dataframe
data_path = '/kaggle/input/60k-stack-overflow-questions-with-quality-rate/data.csv'
df = pd.read_csv(data_path)

## Which stackoverflow questions should be closed?<br>
Now, we have three types of question classes: high-quality open questions (HQ), low-quality closed questions (LQ_CLOSE), and low-quality open questions (LQ_EDIT). We have 20k samples of each of these question types.<br>
From the organization's perspective, we want to build a predictive model that tells us the class of future questions, based on the historical samples the organization already has.

In [None]:
# Number of questions in each target class
df['Y'].value_counts()

For this purpose, we will use questions from 2019 onward as our validation set and all questions before 2019 as our training set. We will also label-encode the target variable: LQ_CLOSE becomes 0, HQ becomes 1, and LQ_EDIT becomes 2.

In [None]:
# Create feature CreationYear (the text before the first '-' of CreationDate) with a
# vectorised string operation, then remove feature CreationDate
df['CreationYear'] = df['CreationDate'].str.split('-', n=1).str[0].astype(int)
del df['CreationDate']
gc.collect()

# Label encode target feature. The encoded values are assigned back to the column
# instead of calling replace(..., inplace=True) on `df.Y`: an in-place method on a
# column accessor is chained assignment, which pandas warns about and which leaves
# `df` unchanged once Copy-on-Write is active.
label_codes = {'LQ_CLOSE': 0, 'HQ': 1, 'LQ_EDIT': 2}
encoded_y = df['Y'].map(label_codes)
if encoded_y.isna().any():
    # map() yields NaN for any value missing from label_codes; fail loudly here
    # rather than hand a partly encoded target to the model
    raise ValueError('Column Y contains labels other than LQ_CLOSE, HQ and LQ_EDIT')
df['Y'] = encoded_y.astype(int)

# Create train and test dataframes: rows created before split_year go to train,
# rows from split_year onward go to test
split_year = 2019
model_columns = ['Title', 'Body', 'Tags', 'Y']
is_train = df['CreationYear'] < split_year
train_df = df.loc[is_train, model_columns].copy()
test_df = df.loc[~is_train, model_columns].copy()

# Delete main dataframe to clear some memory
del df
gc.collect()

train_df.shape, test_df.shape

Our next task is the most important section of the notebook: building features that will help our model make predictions. Before we build any features, we will clean up the raw HTML text in the data. First, we will remove all tag brackets from the Tags feature.

In [None]:
def clean_tags(string):
    """Turn a bracketed tag string such as '<a><b>' into the space-separated form 'a b'."""
    # Adjacent tags meet at '><'; that boundary becomes the separating space
    spaced = string.replace('><', ' ')
    # Strip every angle bracket that is still present
    return spaced.translate(str.maketrans('', '', '<>'))

# Overwrite the Tags column of both frames with its bracket-free form
for df in (train_df, test_df):
    df['Tags'] = [clean_tags(raw_tags) for raw_tags in df.Tags.values]

Next, we want to extract the useful text from the HTML stored in the Body feature of the dataframe. For this task we use the libraries BeautifulSoup, nltk and re.<br>
Along with that, we will also add three new indicator features: code snippet, reference link tag and image tag.

In [None]:
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from tqdm.notebook import tqdm
import re

In [None]:
def clean_body_text(df, stop_words=None, parser=None):
    """Extract cleaned paragraph text and three tag indicators from each question body.

    Every entry of ``df['Body']`` is lower-cased and parsed as HTML. ``<code>``
    elements are blanked out first, so code never reaches the extracted text.
    The text of each ``<p>`` element is then reduced to alphanumeric tokens,
    stop words are dropped, and the paragraphs of one body are joined with
    single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Body`` column holding HTML strings.
    stop_words : collection of str, optional
        Tokens to discard (compared after lower-casing). When omitted, nltk's
        English stop-word list is used.
    parser : str, optional
        Parser name passed to ``BeautifulSoup`` as ``features`` (for example
        ``'html.parser'``). ``None`` leaves the choice to BeautifulSoup, exactly
        as when no parser is given.

    Returns
    -------
    tuple of four lists, each with one entry per row of ``df``
        ``body_text`` (str), ``code_indicator``, ``reference_link_indicator``
        and ``image_indicator`` (1 when the body holds at least one ``<code>``,
        ``<a>`` or ``<img>`` element respectively, else 0).
    """
    if stop_words is None:
        # Create the set of english stopwords from nltk library
        stop_words = set(stopwords.words('english'))

    body_text = []
    code_indicator = []
    reference_link_indicator = []
    image_indicator = []

    for raw_body in tqdm(df['Body'].values):
        soup = BeautifulSoup(raw_body.lower(), features=parser)

        # Look the <code> elements up once: the result feeds the indicator and
        # is then used to blank the snippets out of the tree
        code_tags = soup.find_all('code')
        code_indicator.append(int(bool(code_tags)))
        for code_tag in code_tags:
            code_tag.replace_with('')

        # These lookups run after the <code> elements were replaced, so links
        # and images nested inside code snippets are not counted
        reference_link_indicator.append(int(bool(soup.find_all('a'))))
        image_indicator.append(int(bool(soup.find_all('img'))))

        paragraphs = []
        for paragraph in soup.find_all('p'):
            # Every non-alphanumeric character, newlines included, becomes a
            # space. Newlines are deliberately NOT deleted beforehand: deleting
            # them would glue the last word of one line to the first word of
            # the next ("foo\nbar" -> "foobar").
            line = re.sub(r'[^A-Za-z0-9]', ' ', paragraph.get_text())
            paragraphs.append(' '.join(word for word in line.split() if word not in stop_words))

        body_text.append(' '.join(paragraphs))

    return body_text, code_indicator, reference_link_indicator, image_indicator

In [None]:
# Attach the cleaned body text and the three indicator columns to each frame
body_feature_names = ('body_text', 'code_indicator', 'reference_link_indicator', 'image_indicator')
for frame in (train_df, test_df):
    for column, values in zip(body_feature_names, clean_body_text(frame)):
        frame[column] = values

Similarly, we will clean the text of the Title feature of the dataframe.

In [None]:
def clean_title_text(df, stop_words=None):
    """Return the cleaned title of every row of ``df`` as a list of strings.

    Each entry of ``df['Title']`` is lower-cased, every non-alphanumeric
    character is turned into a space, stop words are dropped, and the remaining
    tokens are joined with single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Title`` column holding strings.
    stop_words : collection of str, optional
        Tokens to discard (compared after lower-casing). When omitted, nltk's
        English stop-word list is used.

    Returns
    -------
    list of str
        One cleaned title per row, in row order.
    """
    if stop_words is None:
        # Create the set of english stopwords from nltk library
        stop_words = set(stopwords.words('english'))

    title_text = []
    for title in df['Title'].values:
        # Newlines are handled by the substitution (they become spaces).
        # Deleting them first would glue the words on either side together.
        text = re.sub(r'[^A-Za-z0-9]', ' ', title.lower())
        title_text.append(' '.join(word for word in text.split() if word not in stop_words))

    return title_text

In [None]:
# Add the cleaned title text to both frames
for frame in (train_df, test_df):
    frame['title_text'] = clean_title_text(frame)

In [None]:
# Remove the raw Title and Body columns from both frames, then collect garbage
for frame in (train_df, test_df):
    for raw_column in ('Title', 'Body'):
        del frame[raw_column]
gc.collect()

Let's now separate the target feature from the predictor features in both dataframes.

In [None]:
# Move the target column out of each frame into its own Series
train_y = train_df.pop('Y')
test_y = test_df.pop('Y')
gc.collect()

In [None]:
# Shapes of the feature frames and target Series, in train/test order
tuple(part.shape for part in (train_df, test_df, train_y, test_y))

We will now apply a TF-IDF vectorizer to the text features, i.e., title_text, body_text and Tags.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy

In [None]:
# For every object-dtype column: fit a TF-IDF vectorizer on the train column, then
# transform the test column with that same fitted vectorizer. One sparse matrix
# per column is collected for each split.
text_features = train_df.select_dtypes(include='object').columns
train_tfidf, test_tfidf = [], []
for feat in tqdm(text_features):
    vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 4))
    train_matrix = vectorizer.fit_transform(train_df[feat])
    test_matrix = vectorizer.transform(test_df[feat])
    train_tfidf.append(train_matrix)
    test_tfidf.append(test_matrix)

Let's now convert the lists of TF-IDF matrices into stacked sparse matrices. As our base model will be logistic regression, using sparse matrices speeds up the training process.

In [None]:
# Join the per-column TF-IDF matrices side by side and convert each result to CSR
train_tfidf, test_tfidf = [
    scipy.sparse.hstack(blocks).tocsr() for blocks in (train_tfidf, test_tfidf)
]

train_tfidf.shape, test_tfidf.shape

Let's now stack the TF-IDF features and the numeric features together.

In [None]:
# Append the three indicator columns to the right of the TF-IDF features
indicator_columns = ['code_indicator', 'reference_link_indicator', 'image_indicator']
train_indicators = train_df[indicator_columns].values
test_indicators = test_df[indicator_columns].values
train_x = scipy.sparse.hstack([train_tfidf, train_indicators]).tocsr()
test_x = scipy.sparse.hstack([test_tfidf, test_indicators]).tocsr()

In [None]:
# Shapes of the final train and test feature matrices
tuple(matrix.shape for matrix in (train_x, test_x))

Let's use the simplest of all classification models, logistic regression, as our baseline model.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [None]:
# Baseline model: logistic regression with max_iter=1000 and n_jobs=-1,
# fitted on the stacked training features
lr = LogisticRegression(n_jobs=-1, max_iter=1000)
lr.fit(X=train_x, y=train_y)

In [None]:
# Predict class labels for the train and test feature matrices with the fitted model
train_y_pred, test_y_pred = [lr.predict(features) for features in (train_x, test_x)]

In [None]:
# Accuracy of the fitted model on the training split
train_accuracy = lr.score(train_x, train_y)
print('Mean accuracy score:', train_accuracy)

# Confusion matrix (rows: true labels, columns: predicted labels) drawn as an
# annotated heatmap on an explicitly created axes
train_confusion = metrics.confusion_matrix(train_y, train_y_pred)
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(train_confusion, annot=True, fmt='d', cmap='Reds', cbar=False, ax=ax)
ax.set_title('Confusion matrix: Train set prediction', fontsize=16)
ax.set_xlabel('Predicted label', fontsize=14)
ax.set_ylabel('True label', fontsize=14);

In [None]:
# Accuracy of the fitted model on the test split
test_accuracy = lr.score(test_x, test_y)
print('Mean accuracy score:', test_accuracy)

# Confusion matrix (rows: true labels, columns: predicted labels) drawn as an
# annotated heatmap on an explicitly created axes
test_confusion = metrics.confusion_matrix(test_y, test_y_pred)
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(test_confusion, annot=True, fmt='d', cmap='Reds', cbar=False, ax=ax)
ax.set_title('Confusion matrix: Test set prediction', fontsize=16)
ax.set_xlabel('Predicted label', fontsize=14)
ax.set_ylabel('True label', fontsize=14);

Clearly, the model has overfitted the data: mean accuracy is 94% on the train set but only 79% on the test set.<br>
The next step is to apply more advanced machine learning models. So please upvote for the motivation. Update coming soon...