In [None]:
!pip install --upgrade seaborn

In [None]:
import gc
import re
import string
import numpy as np 
import random
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker

%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from collections import Counter

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator


import nltk
from nltk.corpus import stopwords

from tqdm import tqdm_notebook as tqdm
import os
import nltk
import spacy
import random
from spacy.util import compounding
from spacy.util import minibatch

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Download the NLTK "stopwords" corpus; remove_stopword() reads it through
# stopwords.words('english').
nltk.download("stopwords")

# Helper functions

In [None]:
def func(pct, allvalues):
    """Format a pie-slice percentage as a label with one decimal place.

    Meant to be used inside a matplotlib ``autopct`` callback.

    Parameters
    ----------
    pct : float
        Share of the slice, expressed in percent.
    allvalues : sequence
        Values of all slices. The label does not need them; the parameter is
        kept only so that existing callers keep working.

    Returns
    -------
    str
        ``pct`` rendered with one decimal place followed by ``%``.
    """
    return "{:.1f}%".format(pct)


def remove_single_word_num(sent):
    '''
    Filter the whitespace-separated tokens of a sentence.

    Dropped: purely numeric tokens, single letters and tokens that mix letters with digits.
    Kept: alphabetic words longer than one character and every token that contains at least
    one non-alphanumeric character. Example: "x + 23 y - abc" becomes "+ - abc".
    '''
    def _keep(token):
        if token.isdigit():
            return False
        if not token.isalnum():
            return True
        return token.isalpha() and len(token) > 1

    return ' '.join(filter(_keep, sent.split()))


def insert_spaces(sentence):
    '''
    Put a space at every boundary between letters, digits and special characters.

    "2x+y" becomes "2 x + y". Spaces already present are kept as they are, so
    "2x+y -1/3x" becomes "2 x + y  - 1 / 3 x" (note the doubled space after "y").
    No space is inserted after a space or after a backslash, so a backslash stays
    attached to whatever follows it.

    An empty input is returned as an empty string instead of raising IndexError.
    '''
    if not sentence:
        return ''

    pieces = []
    # Look at each character together with its right-hand neighbour.
    for current, following in zip(sentence, sentence[1:]):
        pieces.append(current)

        if current.isalpha():  # letter followed by a digit or a special character
            needs_gap = following.isdigit() or (not following.isalnum())
        elif current.isdigit():  # digit followed by a letter or a special character
            needs_gap = following.isalpha() or (not following.isalnum())
        elif (not current.isalnum()) and (current not in (' ', '\\')):
            # special character (other than space/backslash) followed by a letter or digit
            needs_gap = following.isalnum()
        else:
            needs_gap = False

        if needs_gap:
            pieces.append(' ')

    # The last character has no right-hand neighbour and is copied as is.
    pieces.append(sentence[-1])

    return ''.join(pieces)


def preprocess(a):
    """Clean one raw question string with a fixed sequence of regex substitutions.

    The steps are order-dependent: the text is lower-cased, bracketed spans, a
    leading question marker, MathPix math spans and option markers are removed,
    letters/digits/special characters are split apart, single letters and numbers
    are dropped, and finally every non-word character is replaced by a space.

    Parameters
    ----------
    a : str
        Raw question text.

    Returns
    -------
    str
        Cleaned text with whitespace runs collapsed to a single space. The
        result is not stripped, so it may start or end with a space.
    """
    # convert the characters into lower case
    a = a.lower()

    # replace newline characters with a space (the pattern "\\n" is the regex \n)
    a = re.sub("\\n", " ", a)

    # remove every [ ... ] span, matched non-greedily
    a = re.sub(r"\[(.*?)\]",' ',a)

    # remove a leading question marker such as "Q5. ", "5. " or "question 5. "
    # NOTE(review): the intended set of prefixes is taken from the original comment — confirm
    a = re.sub(r"^[\w]+(\s|\.)(\s|\d+(\.*(\d+|\s)))\s*", " ", a)

    # remove MathPix markdown from \( up to \), keeping only the text found inside
    # \name { ... } commands whose name has at least 3 lowercase letters (e.g. \text { keep this }).
    # The substitution runs on repr(a), i.e. on the quoted and backslash-escaped form of the
    # string; presumably that is why the pattern accepts one or more backslashes (\\+).
    a = re.sub(r'\s*\\+\((.*?)\\+\)', lambda x: " ".join(re.findall(r'\\[a-z]{3,}\s*{([^{}]*)}', x.group(1))), repr(a))

    # remove options from questions, i.e. ( ... ) groups that contain no whitespace, e.g. (a), (ii)
    a = re.sub(r"\s*\([^)\s]*\)\s*", " ", a)

    # remove any special character repeated two or more times in a row (.. ,, ___ +++ etc.);
    # letters, digits, backslash, space and the brackets (){}[] are excluded from this rule
    a = re.sub(r"([^a-zA-Z0-9\\ (){}\]\[])\1{1,}",' ',a)

    # remove { } groups holding at most 2 characters, e.g. {q.}, {5.}
    a = re.sub(r"{.{0,2}}", " ", a)

    # insert spaces between letters, digits and special characters, then drop
    # every single-letter token and every number
    a = remove_single_word_num(insert_spaces(a))

    # remove every backslash together with the run of non-space characters that follows it
    a = re.sub(r"(\\[^ ]+)",' ',a)

    # replace every remaining non-word character and every underscore with a space
    a = re.sub(r'(\W)|([_])',' ',a)

    # replace newline characters with a space
    a = re.sub("\\n", " ", a)

    # collapse runs of whitespace into a single space
    a = re.sub(r"\s+", " ", a)
  
    return a


def remove_stopword(x):
    """Return the tokens of ``x`` that are not NLTK English stopwords.

    The stopword corpus is read once and cached on the function object as a
    frozenset. This function is applied to every row of the frame, so reloading
    the corpus and scanning a list for each token would be needlessly slow.

    Parameters
    ----------
    x : iterable of str
        Tokens of one document.

    Returns
    -------
    list of str
        The tokens of ``x`` in their original order, without stopwords.
    """
    stopword_set = getattr(remove_stopword, "_stopword_set", None)
    if stopword_set is None:
        stopword_set = frozenset(stopwords.words('english'))
        remove_stopword._stopword_set = stopword_set
    return [y for y in x if y not in stopword_set]

def count_special_characters(df, col):
    """Placeholder that is not implemented: the body is empty and None is returned.

    NOTE(review): presumably meant to count the non-alphanumeric characters of
    ``df[col]`` — confirm the intended behaviour before implementing.
    """
    
    pass

def common_tokens(data, col, top_most=50, title=None, return_temp=False, is_top=True):
    """Display, and optionally return, the most or least frequent tokens of a column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose column ``col`` holds one iterable of tokens per row.
    col : str
        Name of the token-list column.
    top_most : int, default 50
        Number of tokens to report.
    title : str, optional
        Unused at the moment (the bar chart it labelled is disabled); kept so
        existing calls stay valid.
    return_temp : bool, default False
        When True the frequency table is returned, otherwise None is returned.
    is_top : bool, default True
        True reports the most frequent tokens, False the least frequent ones
        (rarest first).

    Returns
    -------
    pandas.DataFrame or None
        Table with the columns ``Common_words`` and ``count``.
    """
    token_counts = Counter(token for tokens in data[col] for token in tokens)

    if is_top:
        selected = token_counts.most_common(top_most)
    else:
        # Reverse the full ranking first, then cut: this yields exactly
        # `top_most` rows. The former `[:-top_most:-1]` slice stopped one
        # element early and returned only `top_most - 1` rows.
        selected = token_counts.most_common()[::-1][:top_most]

    # Naming the columns in the constructor also works when nothing was selected.
    temp = pd.DataFrame(selected, columns=['Common_words', 'count'])
    display(temp.style.background_gradient(cmap='Blues'))

    if return_temp:
        return temp

def plot_wordcloud(text, mask=None, max_words=250, max_font_size=100, figure_size=(24.0,16.0), color = 'black',
                   title = None, title_size=40, image_color=False):
    """Draw a word cloud for a text or for a collection of texts.

    Parameters
    ----------
    text : str or iterable of str
        A single string, or a collection of documents (e.g. a pandas Series).
        A collection is joined with spaces so that every document contributes
        to the cloud. Calling ``str()`` on a Series would instead feed in its
        truncated printed representation, including words such as "dtype".
    mask : array-like, optional
        Image passed to ``ImageColorGenerator``; only read when
        ``image_color`` is True.
    max_words, max_font_size : int
        Forwarded to ``WordCloud``.
    figure_size : tuple
        Size of the matplotlib figure.
    color : str
        Background colour of the cloud.
    title : str, optional
        Figure title.
    title_size : int
        Font size of the title.
    image_color : bool, default False
        When True the cloud is recoloured from ``mask``.
    """
    if isinstance(text, (str, bytes)) or not hasattr(text, "__iter__"):
        corpus = str(text)
    else:
        corpus = " ".join(str(document) for document in text)

    cloud = WordCloud(background_color=color,
                      stopwords=set(STOPWORDS),
                      max_words=max_words,
                      max_font_size=max_font_size,
                      random_state=42,
                      width=400,
                      height=200)
    cloud.generate(corpus)

    plt.figure(figsize=figure_size)
    if image_color:
        image_colors = ImageColorGenerator(mask)
        plt.imshow(cloud.recolor(color_func=image_colors), interpolation="bilinear")
        plt.title(title, fontdict={'size': title_size,
                                   'verticalalignment': 'bottom'})
    else:
        plt.imshow(cloud)
        plt.title(title, fontdict={'size': title_size, 'color': 'black',
                                   'verticalalignment': 'bottom'})
    plt.axis('off')
    plt.tight_layout()

Let's start with reading the data.

In [None]:
# Read the question dump, discard the rows labelled "English" and renumber
# the index from 0.
df = (
    pd.read_csv("../input/iitjee-neet-aims-students-questions-data/subjects-questions.csv")
    .loc[lambda frame: frame["Subject"] != "English"]
    .reset_index(drop=True)
)
df

# Check for Null values

In [None]:
# Count the missing values of every column and render the counts as a
# colour-graded table.
df.isna().sum().to_frame().rename(columns={0:"NaN_Count"}).style.background_gradient(cmap="Wistia")

As we can see, there are 3 NaN values in the `eng` column, which is supposed to hold the question description itself. Hence, let's remove those 3 samples and move ahead.

In [None]:
# Drop the rows without a question text, renumber the index and confirm that
# no missing values are left.
df = df.dropna(subset=["eng"]).reset_index(drop=True)
df.isna().sum().to_frame("NaN_Count").style.background_gradient(cmap="Wistia")

Our target is to classify whether the description belongs to one of the categories `Physics`, `Maths`, `Chemistry` or `Bio`. Let's visualise the distribution of our target.

# Number of samples per Subject

In [None]:
# Number of samples per subject; `target_Count` is reused by the pie chart below.
target_Count = df['Subject'].value_counts().to_frame()
target_Count.style.background_gradient(cmap="BrBG")

As far as this data is concerned, the dataset is pretty imbalanced: `Bio` has very few samples (`English`, the other sparse class, was already dropped while loading the data), while the remaining 3 classes, i.e. `Physics`, `Chemistry` and `Math`, have comparatively more samples. So, students seem to face more difficulties in `Physics`, then `Chemistry` and then `Math` respectively, as they are seeking help in these subjects. Let's have a pie chart for better visualization.

In [None]:
# Derive the per-subject share in a new frame instead of mutating
# `target_Count` in place, so the cell gives the same result when re-run.
subject_share = target_Count.reset_index()
subject_share.columns = ["Subject", "pct"]
# Plain column assignment: the integer counts become float fractions without
# writing floats into an integer column through .loc.
subject_share["pct"] = subject_share["pct"] / len(df)


plt.figure(figsize=(6, 10))
wedges, texts, autotexts = plt.pie(subject_share["pct"],
                                   autopct=lambda pct: func(pct, subject_share["pct"]),
                                   # one offset per slice, whatever the number of subjects
                                   explode=[0.05] * len(subject_share),
                                   labels=subject_share["Subject"],
                                   shadow=True,
                                   startangle=45,
                                   wedgeprops={"linewidth":1, "edgecolor":"black"},
                                   textprops= dict(color="black"))
plt.legend(wedges, subject_share["Subject"],
          title="Subjects",
          loc="center",
          bbox_to_anchor=(1, 0, 0, 0))
plt.setp(autotexts, size=14, weight="bold")
plt.title("Subject Percentage distribution")
plt.show()

In [None]:
# Free up space: the per-subject counts are not used after the pie chart.
del target_Count
gc.collect()

The pie chart shows that $96.5\%$ of the doubts come from `PCM` alone, with `Physics` being the toughest among them. Now, let's analyse our text data, i.e. the `eng` column, for a better understanding of our data.

# Length of the Questions/ Description

In [None]:
# Let's start with the length of the description, measured as the number of
# whitespace-separated tokens.
df["length_eng"] = df['eng'].apply(lambda x: len(x.split()))
# `sns.distplot` is deprecated; `histplot` with a KDE overlay on a density
# scale draws the equivalent figure.
sns.histplot(df["length_eng"], color="red", bins=25, kde=True, stat="density")
plt.title("Length of the description")
plt.show()

Okay, we can say that the length data is left skewed. Before going more statistical, let's look for outliers.

Let's see whether there are any outliers by plotting a box plot and a violin plot.

In [None]:
# Question length per subject, drawn side by side: box plot on the left axis,
# violin plot on the right axis.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(16, 10))
sns.boxplot(x="Subject", y="length_eng", data=df, ax=ax1)
sns.violinplot(x="Subject", y="length_eng", data=df, ax=ax2)
plt.show()

There are outliers, far too many outliers, in our length column. Well, the takeaways from these plots would be:

- `Physics` questions are lengthier and `Biology` questions are shorter.
- `Biology` seems to have fewer outliers, which indicates that most of the biology questions' lengths fall near their median value, with only a few outliers. The variance is low.
- `Physics` seems to have highest variance with lowest median.
- `Chemistry` has the longest description, yet its not-so-large variance indicates that a significant number of samples lie around the median value. The same goes for `Math`, with a lower *longest description* value.

# Statistical Analysis of Length

In [None]:
def pct_25(x):
    """Return the 25th percentile of ``x``."""
    return np.percentile(x, 25)


def pct_75(x):
    """Return the 75th percentile of ``x``."""
    return np.percentile(x, 75)


# The callables' __name__ is overridden so that they carry the labels
# "25%" and "75%" instead of their Python identifiers.
pct_25.__name__ = "25%"
pct_75.__name__ = "75%"

In [None]:
# Summary statistics of the question length, one row per subject.
length_aggregations = ["count", "min", pct_25, "mean", "median", pct_75, "max", "std", "var"]
(
    df.pivot_table(values="length_eng", index="Subject", aggfunc=length_aggregations)
    .style.background_gradient(cmap="plasma")
)

*Note*: The above statistics were obtained on the raw text, before any preprocessing was applied.

<h1 style="color:red">If you find this notebook interesting and well written, don't forget to upvote :)</h1>

Well, here comes another crucial observation that I think we should point out before moving ahead. Let me show.

# Analysing most common words in the dataset and per subject

In [None]:
# Render the frame in its current state, including the `length_eng` column added above.
display(df)

In [None]:
# Print 5 randomly chosen raw questions. No seed is set, so the selection
# differs between runs.
for _ in range(5):
    print(df.loc[np.random.randint(0, len(df)), "eng"])

As we can see, most of the questions contain options such as `(a)`, `(b)`, `(i)` etc. Of course, we will find such options in every subject. As our aim is to build a classifier that decides which Subject a given question or description belongs to, keeping these tokens would only bring redundant similarity, so let's remove them and apply some basic preprocessing to move ahead with our exploration.

Let's prepare our preprocessing function and see the effects.   
*NOTE*: The functions can be found under the Helper functions section :)

In [None]:
# Show 25 random questions together with their own subject label, their row
# number and the preprocessed text.
for _ in range(25):
    print("\n")
    i = np.random.randint(0, len(df))
    # Read the question and its label from the SAME row; the label used to be
    # taken from a second, unrelated random row.
    x, y = df.loc[i, "eng"], df.loc[i, "Subject"]
    print(x)
    print()
    print(y, "\t", i, "\n")
    print(preprocess(x))

Seems like there are too many stop words, but we will see, because assuming stopwords based on `English` literature might not work in our case.

# Comparison with After and Before Processing

In [None]:
# Store the cleaned version of every question next to the raw text.
df["processed_eng"] = df["eng"].apply(preprocess)
display(df.head())

### Without removing stopwords

#### Most common words without any preprocessing

In [None]:
# Whitespace-tokenise the raw questions and show their most frequent tokens.
df["token_list"] = df["eng"].apply(str.split)
common_tokens(df, col="token_list", top_most=2000)  # experiment with stopwords(removing/not removing)

We can see the unexpected tokens as we haven't applied preprocessing yet. Let's apply our preprocessing function and look for the most common words.

#### Least common words without any preprocessing

In [None]:
# Same raw token lists, read from the rare end of the frequency ranking.
common_tokens(df, col="token_list", top_most=2000, is_top=False)

#### Most common words with Preprocessing

In [None]:
# Whitespace-tokenise the preprocessed questions and keep their most frequent
# tokens for the comparison table built later.
df["token_list_processed"] = df["processed_eng"].apply(str.split)
most_2000 = common_tokens(df, col="token_list_processed", top_most=2000, return_temp=True)

#### Least common words with Preprocessing

In [None]:
# Least frequent tokens of the preprocessed questions, kept for the comparison table built later.
least_2000 = common_tokens(df, col="token_list_processed", top_most=2000, is_top=False, return_temp=True)

### With Stopwords removal

In [None]:
# Filter the English stopwords out of both token-list columns (raw and preprocessed).
df["token_list"] = df["token_list"].apply(remove_stopword)
df["token_list_processed"] = df["token_list_processed"].apply(remove_stopword)

#### Most common words without any preprocessing

In [None]:
# Most frequent raw tokens once the stopwords are removed.
common_tokens(df, col="token_list", top_most=2000)

#### Least common words without any preprocessing

In [None]:
# Least frequent raw tokens once the stopwords are removed.
common_tokens(df, col="token_list", top_most=2000, is_top=False)

#### Most common words with preprocessing

In [None]:
# Most frequent preprocessed tokens after stopword removal ("_sr"), kept for the comparison table built later.
most_2000_sr = common_tokens(df, col="token_list_processed", top_most=2000, return_temp=True)

#### Least Common words with preprocessing

In [None]:
# Least frequent preprocessed tokens after stopword removal ("_sr"), kept for the comparison table built later.
least_2000_sr = common_tokens(df, col="token_list_processed", top_most=2000, return_temp=True, is_top=False)

Well, it seems much clearer now.

# Category wise Data

In [None]:
# One sub-frame per subject, selected on the `Subject` label.
Phy, Math, Chem, Bio = (
    df[df["Subject"] == subject_label]
    for subject_label in ("Physics", "Maths", "Chemistry", "Biology")
)

In [None]:
# Tag every subject frame with a short label; the special-character analysis
# further down reads it back as `ddf.name` to name its count columns.
# NOTE(review): this is an ad-hoc Python attribute, not a column; it is
# presumably lost on any copy or derived frame — confirm before relying on it.
Phy.name = "Phy"
Chem.name = "Chem"
Bio.name = "Bio"
Math.name = "Math"

#### Most Common Words in Physics

In [None]:
# Most frequent Physics tokens (preprocessed, stopwords removed).
top_phy = common_tokens(Phy, col="token_list_processed", top_most=2000, return_temp=True)

#### Least common words in Physics

In [None]:
# Least frequent Physics tokens (preprocessed, stopwords removed).
least_Phy = common_tokens(Phy, col="token_list_processed", top_most=2000, return_temp=True, is_top=False)

#### Most common words in Chemistry

In [None]:
# Most frequent Chemistry tokens (preprocessed, stopwords removed).
top_chem = common_tokens(Chem, col="token_list_processed", top_most=2000, return_temp=True)

#### Least common words in Chemistry

In [None]:
# Least frequent Chemistry tokens (preprocessed, stopwords removed).
least_Chem = common_tokens(Chem, col="token_list_processed", top_most=2000, return_temp=True, is_top=False)

#### Most common words in Biology

In [None]:
# Most frequent Biology tokens (preprocessed, stopwords removed).
top_bio = common_tokens(Bio, col="token_list_processed", top_most=2000, return_temp=True)

#### Least common words in Biology

In [None]:
# Least frequent Biology tokens (preprocessed, stopwords removed).
least_Bio = common_tokens(Bio, col="token_list_processed", top_most=2000, return_temp=True, is_top=False)

#### Most common words in Maths

In [None]:
# Most frequent Maths tokens (preprocessed, stopwords removed).
top_math = common_tokens(Math, col="token_list_processed", top_most=2000, return_temp=True)

#### Least common words in Maths

In [None]:
# Least frequent Maths tokens (preprocessed, stopwords removed).
least_math = common_tokens(Math, col="token_list_processed", top_most=2000, return_temp=True, is_top=False)

What can be our takeaways from here?

- Even after stop-word removal, we can still notice some irrelevant tokens appearing among the most common words.
- In most of the cases, these are (b), (a_number), Q., etc. These can be assumed as the option number of the questions. (The same has been discussed with Vedant, and the suggestions on removing such instances are implemented.)

*NOTE: These updates have been made in this version. I have removed all such occurrences that might hamper the model performance and cause unnecessary similarity between target classes.*

# Most common words across classes

Let's check out what will happen if we analyse inter-category questions.

In [None]:
# we have dataframes for each subject as well as on the whole set
common_words = pd.concat([most_2000.drop("count", axis=1).rename(columns={"Common_words": "most_common"}),
           most_2000_sr.drop("count", axis=1).rename(columns={"Common_words": "most_common_sr"}),
           least_2000.drop("count", axis=1).rename(columns={"Common_words": "least_common"}),
           least_2000_sr.drop("count", axis=1).rename(columns={"Common_words": "least_common_sr"}),
           top_phy.drop("count", axis=1).rename(columns={"Common_words": "most_phy"}),
           least_Phy.drop("count", axis=1).rename(columns={"Common_words": "least_phy"}),
           top_chem.drop("count", axis=1).rename(columns={"Common_words": "most_chem"}),
           least_Chem.drop("count", axis=1).rename(columns={"Common_words": "least_chem"}),
           top_bio.drop("count", axis=1).rename(columns={"Common_words": "most_bio"}),
           least_Bio.drop("count", axis=1).rename(columns={"Common_words": "least_bio"}),
           top_math.drop("count", axis=1).rename(columns={"Common_words": "most_math"}),
           least_math.drop("count", axis=1).rename(columns={"Common_words": "least_math"})], axis=1)
display(common_words.head())
common_words.to_csv("common-words-analysis.csv", index=False)

# Analysing Special characters

In [None]:
# Character-level tally of everything that is not a letter or a digit (spaces
# included), first on the raw text and then on the preprocessed text.
raw_special_counts = Counter(
    char for question in df["eng"] for char in question if not char.isalnum()
)
special_dict = pd.DataFrame(raw_special_counts.most_common())

processed_special_counts = Counter(
    char for question in df["processed_eng"] for char in question if not char.isalnum()
)
special_dict_pp = pd.DataFrame(processed_special_counts.most_common())

In [None]:
# Give both character tables the same column names: the character itself and its overall count.
special_dict.columns = ["Special_tokens", "count"]
special_dict_pp.columns = ["Special_tokens", "count"]

In [None]:
# Add one count column per subject to the overall special-character table.
# The labels are spelled out here rather than read from an ad-hoc `.name`
# attribute on the frames, which is easy to lose.
subject_frames = {"Phy": Phy, "Chem": Chem, "Bio": Bio, "Math": Math}

# Restart from the two base columns so that re-running the cell does not
# merge the subject columns a second time.
special_dict = special_dict[["Special_tokens", "count"]]

for subject_name, ddf in subject_frames.items():

    special_ = Counter(char for question in ddf["eng"] for char in question if not char.isalnum())
    # Naming the columns in the constructor also works for an empty subject frame.
    special_ = pd.DataFrame(special_.most_common(), columns=["Special_tokens", subject_name])

    special_dict = pd.merge(special_dict, special_, how="left", on="Special_tokens")
    # Characters that never occur in this subject get a count of 0.
    special_dict[subject_name] = special_dict[subject_name].fillna(0).astype("int64")

special_dict

In [None]:
# Same table again, colour-graded to make the large counts stand out.
display(special_dict.style.background_gradient(cmap="twilight_shifted"))

In [None]:
# Save the special-character table to a CSV file, without the index column.
special_dict.to_csv("special-characters-analysis.csv", index=False)

# Time for Wordcloud

In [None]:
# Word cloud built from the preprocessed Physics questions.
plot_wordcloud(Phy.processed_eng, color='black', max_font_size=100, title_size=30, title="WordCloud of Physics")

In [None]:
# Word cloud built from the preprocessed Chemistry questions.
plot_wordcloud(Chem.processed_eng, color='black', max_font_size=100, title_size=30, title="WordCloud of Chemistry")

In [None]:
# Word cloud built from the preprocessed Maths questions.
plot_wordcloud(Math.processed_eng, color='black', max_font_size=100, title_size=30, title="WordCloud of Maths")

In [None]:
# Word cloud built from the preprocessed Biology questions.
plot_wordcloud(Bio.processed_eng, color='black', max_font_size=100, title_size=30, title="WordCloud of Biology")