# Predicting the Quality of Stack Overflow Questions

## 1. Import Data

In [None]:
# Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# File names of the two CSV splits; they are resolved against DIR when loaded
DIR = os.getcwd()  # current working directory
train_path, val_path = (
    "stack_overflow_questions_train.csv",
    "stack_overflow_questions_valid.csv",
)

In [None]:
# Read the training split from <DIR>/data and preview its first rows
TRAIN_PATH = os.path.join(DIR, "data", train_path)
train_df = pd.read_csv(filepath_or_buffer=TRAIN_PATH)

train_df.head()

In [None]:
# Load Validation Data (read from the "data" folder under DIR)
VAL_PATH = os.path.join(DIR,"data",val_path)
val_df = pd.read_csv(VAL_PATH)

# Preview the first rows of the validation frame
val_df.head()

In [None]:
# Print the row count of each split and its share (in percent) of all rows.
# total_data is also read by the concat sanity check in the next cell.
n_train, n_val = train_df.shape[0], val_df.shape[0]
total_data = n_train + n_val
print(f" Train Data: Count = {n_train}   Percentage of Data = {round(n_train * 100 / total_data, 2)}")
# Fixed label: this line reports the validation split, not the training split
print(f" Val Data: Count = {n_val}   Percentage of Data = {round(n_val * 100 / total_data, 2)}")

In [None]:
# Stack the training and validation frames on top of each other (row-wise)
combined_df = pd.concat((train_df, val_df), axis=0)

# Sanity check: the stacked frame holds every row of both inputs
assert len(combined_df) == total_data

## 2. Data Exploration & Preparation

* Target Distribution

In [None]:
# One histogram of the target label "Y" per split, drawn side by side
fig, axs = plt.subplots(1, 2, figsize=(9, 2))
split_panels = (
    (train_df, "Train Data Question Quality "),
    (val_df, "Val. Data Question Quality "),
)
for ax, (split_df, panel_title) in zip(axs, split_panels):
    split_df["Y"].hist(ax=ax)
    ax.set_title(panel_title)
    ax.set_ylabel("Count")
    ax.set_xlabel("Quality Label")

* Check for Nulls

In [None]:
# Check data types and nulls: info() prints each column's dtype and non-null count,
# so any column whose non-null count is below the row count contains missing values
combined_df.info()

* Convert Body text from XML to string

In [None]:
from bs4 import BeautifulSoup
import re

# Function to remove markup from a question body
def convert_text(text):
    """Return the visible text of a marked-up question body as a single line.

    Parameters
    ----------
    text : str
        Raw body containing markup such as ``<p>...</p>`` tags.

    Returns
    -------
    str
        The text with all tags removed, carriage returns and newlines
        replaced by spaces, and leading/trailing whitespace stripped.
    """
    # Use the standard-library backed "html.parser": the bodies are fragments
    # made of several sibling tags, while the "xml" builder needs lxml and
    # expects one well-formed root element, so it does not reliably parse them.
    text = BeautifulSoup(text, "html.parser").get_text()        # parse and get text
    text = text.replace('\r', ' ').replace('\n', ' ').strip()   # remove formatting
    return text

# For each split: record a paragraph count, then swap the marked-up body for plain text
dfs = [train_df, val_df]
paragraph_end = re.compile(r'</p>\n')
for df in dfs:
    # Count first: the closing tags being counted are stripped by convert_text
    df['Paragraphs'] = df['Body'].map(lambda body: len(paragraph_end.findall(body)))

    # Replace every body with its tag-free, single-line version
    df['Body'] = df['Body'].map(convert_text)

# View converted text
train_df.head()

* Clean & Reformat Features
  

In [None]:
import re
import cleantext
from bs4 import BeautifulSoup
import contractions
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt_tab')

# Clean and reformat data
stop_words = set(stopwords.words('english')) # Define stop words

# Options handed to cleantext.clean for every Title/Body string
clean_options = dict(
    fix_unicode=True,               # fix various unicode errors
    to_ascii=True,                  # transliterate to closest ASCII representation
    lower=True,                     # lowercase text
    no_line_breaks=True,            # fully strip line breaks as opposed to only normalizing them
    no_urls=True,                   # replace all URLs with a special token
    no_emails=True,                 # replace all email addresses with a special token
    no_phone_numbers=True,          # replace all phone numbers with a special token
    no_numbers=True,                # replace all numbers with a special token
    no_digits=False,                # replace all digits with a special token
    no_currency_symbols=True,       # replace all currency symbols with a special token
    no_punct=True,                  # remove punctuations
    no_emoji=True,                  # remove emojis
    replace_with_punct=" ",         # instead of removing punctuations you may replace them
    replace_with_url="_URL_",
    replace_with_email="_EMAIL_",
    replace_with_phone_number="_PHONE_",
    replace_with_number="_NUMBER_",
    replace_with_currency_symbol="_CUR_",
    lang="en",                      # set to english handling
)

# Number of rows in combined_df whose Body contains at least one @mention.
# It only depends on combined_df, so it is computed once instead of once per split.
count_user_mentions = combined_df["Body"].loc[combined_df["Body"].str.contains(r'@\w+')].shape[0]

# For both the training and validation data
for df in dfs:

    # Change Creation Date to Year
    df["CreationDate"] = pd.to_datetime(df["CreationDate"]).dt.year

    # Remove id columns (in place, so train_df / val_df themselves are updated)
    df.drop(["Id"], axis=1, inplace=True)

    # Handle user mentions: replace every @name with the placeholder _USER_
    df["Body"] = df["Body"].apply(lambda x: re.sub(r'@\w+', '_USER_', x))

    # Replace tag with count of the number of tags (each tag is one <...> group)
    df["Tags"] = df["Tags"].apply(lambda x: len(re.findall(r'<[^>]+>', x)))

    # Clean text columns
    for col in ["Title", "Body"]:
        # Expand contractions
        df[col] = df[col].apply(contractions.fix)

        # Clean Words
        df[col] = df[col].apply(lambda x: cleantext.clean(x, **clean_options))

        # Collapse every run of two or more spaces into one space
        # (replacing only the literal pair '  ' leaves doubles behind in longer runs)
        df[col] = df[col].apply(lambda x: re.sub(r' {2,}', ' ', x))

        # Remove stop words + word tokenization: each cell becomes a list of tokens
        df[col] = df[col].apply(lambda x: [w for w in word_tokenize(x) if w not in stop_words])
    

In [None]:
# Preview the training data after the cleaning cell above
train_df.head()

In [None]:
# Handle spelling mistakes
# from textblob import TextBlob

# # For both the training and validation data
# for df in dfs:
#     for col in ["Title", "Body"]:
#         df[col] = df[col].apply(lambda x:  ''.join(TextBlob(x).correct()))

* Lemmatization

In [None]:
import spacy
# Load the small English spaCy pipeline used for lemmatization
nlp = spacy.load('en_core_web_sm')


def _as_text(value):
    """Return ``value`` as plain text, joining a token list/tuple with spaces.

    Calling ``str()`` on a token list yields its Python repr
    (``"['a', 'b']"``), which would feed brackets, quotes and commas to the
    lemmatizer as if they were words.
    """
    if isinstance(value, (list, tuple)):
        return " ".join(map(str, value))
    return str(value)


# Replace Title and Body in both splits with their space-separated lemmas
for df in dfs:
    for col in ["Title", "Body"]:
        texts = df[col].map(_as_text)
        # nlp.pipe processes the texts as a stream, in order, one Doc per text
        df[col] = [" ".join(token.lemma_ for token in doc) for doc in nlp.pipe(texts)]

train_df.head()

* Count number of unique words in Title and Body

In [None]:
# Rebuild the combined frame so it reflects the cleaned and lemmatized columns
combined_df = pd.concat([train_df, val_df])

# Report the vocabulary size (number of distinct lemmas) of each text column
for col in ["Title", "Body"]:
    lemmatized_tokens = []
    for doc in combined_df[col]:
        # Drop quote, comma and bracket characters before splitting on whitespace
        doc = re.sub(r"[',\[\]]", "", doc)
        lemmatized_tokens.extend(doc.split())

    # Get unique lemmas: de-duplicate, otherwise the total token count
    # would be reported instead of the vocabulary size
    unique_lemmas = set(lemmatized_tokens)
    vocabulary_size = len(unique_lemmas)
    print(f"{col} - vocabulary size is {vocabulary_size}")

* Title: Get count of overlapping words with Body

In [None]:
def _token_set(text):
    """Return the set of whitespace-separated tokens of ``text`` after
    removing quote, comma and square-bracket characters."""
    return set(re.sub(r"[',\[\]]", "", text).split())


# Title_Count = number of distinct tokens shared by a row's Title and Body
for df in dfs:
    df["Title_Count"] = df.apply(
        lambda row: len(_token_set(row["Title"]).intersection(_token_set(row["Body"]))),
        axis=1,
    )

train_df.head()

* Body: Get subjectivity and polarity

In [None]:
from textblob import TextBlob

# Function to get subjectivity and polarity of the text
def get_sentiment(text):
    """Score ``text`` with TextBlob.

    Returns a ``pd.Series`` holding ``[polarity, subjectivity]`` in that order.
    """
    sentiment = TextBlob(text).sentiment
    return pd.Series([sentiment.polarity, sentiment.subjectivity])

# Score every "Body" and store the two values as the Polarity / Subjectivity columns
for df in dfs:
    sentiment_scores = df["Body"].apply(get_sentiment)
    df[["Polarity", "Subjectivity"]] = sentiment_scores



In [None]:
# Scatter the Body sentiment scores of the training data, coloured by the label "Y"
ax = sns.scatterplot(data=train_df, x="Polarity", y="Subjectivity", hue="Y")
ax.set_title("Subjectivity vs Polarity of Body coloured by Question Quality")
ax.set_xlabel("Polarity")
ax.set_ylabel("Subjectivity")
ax.legend()
plt.show()


* Body: Embed with CountVectorizer (computationally efficient)

    The vocabulary was very large, which would make this step time-consuming even with the computationally efficient embedder, so it was omitted.

* Generate Word Cloud

In [None]:
from wordcloud import WordCloud

# Join every training "Body" into one corpus string and build the word cloud from it
body_corpus = " ".join(train_df["Body"].astype(str).tolist())
cloud = WordCloud().generate(body_corpus)

# Render the cloud as an image without axis ticks or frame
plt.imshow(cloud)
plt.axis('off')
plt.show()

* View histogram of data

In [None]:
# Draw one histogram per numeric column of the training data
train_df.hist()

*  Add Word Count Var

In [None]:
def _unique_word_count(text):
    """Count the distinct whitespace-separated tokens in ``text`` after
    removing quote, comma and square-bracket characters."""
    stripped = re.sub(r"[',\[\]]", "", text)
    return len(set(stripped.split()))


# Body_Count = number of distinct words in each Body
for df in dfs:
    df["Body_Count"] = df["Body"].map(_unique_word_count)

train_df["Body_Count"].head()

* View how features impact target

In [None]:
fig, axs = plt.subplots(1, 4, figsize=(17, 3))
axs = axs.flatten()

# Styling of the median line, identical for every panel
median_style = dict(color="red", linewidth=1.5, linestyle='--')

# (feature column, panel title, y-axis label), listed left to right
panels = [
    ("Body_Count", " Distribution of Word Counts in Target", "Number of Unique Words in Body"),
    ("CreationDate", "Distribution of CreationDate in Target", "Creation Date"),
    ("Tags", "Distribution of Tags in Target", "Count Tags"),
    ("Paragraphs", "Distribution of Paragraphs in Target", "Count Paragraphs"),
]

# One boxplot per feature, grouped and coloured by the target label "Y"
for ax, (feature, panel_title, y_label) in zip(axs, panels):
    sns.boxplot(data=train_df, x="Y", y=feature, hue="Y", ax=ax, medianprops=dict(median_style))
    ax.set_title(panel_title)
    ax.set_ylabel(y_label)
    ax.set_xlabel("Category (Y)")

plt.tight_layout()
plt.show()

## 2. Data Preparation

In [None]:
# Transform Tags and Paragraphs to make distribution more symmetric.
# Both columns are counts produced with len(re.findall(...)), so they can be 0;
# np.log(0) is -inf, which would put infinite values into the features that are
# standardized afterwards. log1p computes log(1 + x) and maps 0 to 0 instead.
for df in dfs:
    for col in ["Tags", "Paragraphs"]:
        df[col] = np.log1p(df[col])

In [None]:
# View summary statistics of the numeric training columns after the
# Tags / Paragraphs transform
train_df.describe()

In [None]:
# Data Standardization
from sklearn.preprocessing import StandardScaler

# define columns to scale: every column except the target and the two text columns.
# train_df.columns is a pandas Index (it has no remove()), so build a plain list.
non_scaled_cols = {"Y", "Body", "Title"}
scale_list = [col for col in train_df.columns if col not in non_scaled_cols]

# Scale training data: learn each column's mean / standard deviation here
train_scaler = StandardScaler()
train_df[scale_list] = train_scaler.fit_transform(train_df[scale_list])

# Scale validation data with the statistics learned from the training data, so
# both splits share one feature scale and nothing is fitted on validation rows.
val_scaler = train_scaler  # name kept for any later cell that refers to val_scaler
val_df[scale_list] = val_scaler.transform(val_df[scale_list])

In [None]:
# View summary statistics of the training data after the standardization cell above
train_df.describe()

  ## 2. Predictive Models

In [None]:
# TODO: Encoder-only transformer

# TODO: Dummy classifier

## 3. Results