# Incorrect Answers Data Analysis

## Overview

Find attached a JSON file containing data about some quiz questions of a hypothetical EdTech startup. Each question has text and a percent correct value, which is the percent of students who have answered that question correctly. The file should be encoded in UTF-8.

## What words or phrases appear more frequently in questions that students tend to do poorly on?

In [None]:
# Mount Google Drive at /content/drive so the data file stored there can be
# read from this Colab runtime.

from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Import Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Import NLTK Libraries
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import string
import re

**Overview**

Find attached a JSON file containing data about some quiz questions of a hypothetical EdTech startup. Each question has text and a percent correct value, which is the percent of students who have answered that question correctly. The file should be encoded in UTF-8.

## Read the Data

In [None]:
# Load the quiz question data (JSON) from the mounted Google Drive into a
# DataFrame.

import pandas as pd

df = pd.read_json('/content/drive/MyDrive/Text Analytics/Mini_project_1/quiz_question_data.json')

# Preview the first rows of the loaded data
df.head()

## Data description

In [None]:
# Summarize the DataFrame: column names, non-null counts and dtypes
df.info()

In [None]:
# Count the missing values in each column
df.isnull( ).sum()

In [None]:
# Summary statistics as computed by DataFrame.describe()
df.describe()

## Map values of 0.5 or less as incorrect and values greater than 0.5 as correct

In [None]:
# Label each question "Correct" when its percent_correct is above 0.5 and
# "Incorrect" otherwise (a value of exactly 0.5 counts as "Incorrect").
above_threshold = df['percent_correct'] > 0.5
df['is_correct'] = np.where(above_threshold, "Correct", "Incorrect")

df

## Flag the incorrect values as 0 and correct values as 1

In [None]:
# Numeric indicator using the same 0.5 threshold: 1 when percent_correct is
# above 0.5, 0 otherwise.
above_threshold = df['percent_correct'] > 0.5
df['flag'] = above_threshold.astype('int64')

df

## Distribution of correct and incorrect answers

In [None]:
# Tabulate how many questions carry each is_correct label
label_counts = df["is_correct"].value_counts()
df_pct = label_counts.to_frame("Counts").reset_index()
df_pct

In [None]:
# Pie chart of the share of questions labelled Correct vs. Incorrect.
# df1 holds the counts table itself rather than the Axes returned by plot(),
# so the name refers to an actual DataFrame.
df1 = df['is_correct'].value_counts().to_frame("Counts")
df1.plot(kind='pie', subplots=True, autopct='%1.1f%%')
plt.title('Share of questions labelled Correct vs. Incorrect')
plt.show()

## Word frequency in Incorrect dataset

In [None]:
# Suppress warnings
# NOTE: this silences every warning category for the rest of the session.
import warnings
warnings.simplefilter("ignore")

## Filter the Incorrect dataset

In [None]:
# What words or phrases appear more frequently in questions that students tend to do poorly on?

# Work on a copy so the original DataFrame is left untouched
df2 = df.copy()

# Create the incorrect dataset (flag == 0). The explicit .copy() makes it an
# independent DataFrame, so columns added to it later are not written to an
# ambiguous slice of df2 (the cause of pandas' SettingWithCopyWarning).
df_incorrect = df2[df2['flag'] == 0].copy()
df_incorrect

## Data cleaning

In [None]:
# Function to clean and preprocess text
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def clean_text(text):
    """Normalize a question's text for word-frequency analysis.

    Steps: drop every character that is not an ASCII letter or whitespace,
    lowercase, tokenize with NLTK, remove English stop words, and lemmatize
    each remaining token with WordNet.

    Args:
        text: Raw text. Non-string values (e.g. None or NaN from a missing
            cell) are treated as empty.

    Returns:
        The cleaned tokens joined by single spaces; '' if nothing remains.
    """
    # Missing cells arrive as None/NaN; re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return ''

    # Keep letters and whitespace only. This already removes digits and
    # punctuation, so no separate punctuation or isdigit() pass is needed.
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()

    stop_words = _english_stop_words()
    lemmatizer = WordNetLemmatizer()

    # Filter stop words first, then lemmatize what is left.
    tokens = [
        lemmatizer.lemmatize(word)
        for word in nltk.word_tokenize(text)
        if word not in stop_words
    ]

    # Reconstruct cleaned text
    return ' '.join(tokens)


In [None]:
# Apply the clean_text function to the text of every question in the
# incorrect dataset. An explicit copy is taken first so the new column is
# written to an independent DataFrame rather than to a slice of another frame.
df_incorrect = df_incorrect.copy()

df_incorrect['Clean_text'] = df_incorrect['text'].apply(clean_text)

# Display the questions alongside their cleaned text
df_incorrect

# Approach 1: Visualize the Word Cloud

In [None]:
# Generate a word cloud for df_incorrect["Clean_text"]

from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Download the nltk stopwords if you haven't done so
nltk.download('stopwords')

# Create a set of stopwords
stop_words = set(stopwords.words('english'))

# Collect the words of every cleaned question. The column values are joined
# directly (rather than taking str() of the Series) so that the truncated
# repr, with its index numbers and dtype footer, never reaches the word cloud.
word_list = " ".join(df_incorrect["Clean_text"]).split()

# Keep repeated words: WordCloud sizes each word by how often it occurs,
# which is the frequency signal this analysis is after.
cloud_text = " ".join(word_list)

# Generate a word cloud, excluding stop words
wordcloud = WordCloud(width=800, height=400, stopwords=stop_words, background_color='white').generate(cloud_text)

# plot the graph
plt.figure(figsize=(10,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

# Approach 2: Visualize Word frequency distribution of the Unigrams, Bigrams and Trigrams using NLTK

## Unigrams Word Frequency Distribution

In [None]:
import collections
from collections import Counter
from itertools import chain

word_tokenize = nltk.word_tokenize

# Split every cleaned question into a list of word tokens
df_incorrect['Tokenized_Text'] = df_incorrect['Clean_text'].apply(word_tokenize)

# Pool the tokens of all questions into one flat list
corpus = df_incorrect['Tokenized_Text'].tolist()
tokens = list(chain.from_iterable(corpus))

# Tally how often each distinct token occurs
unique_freq = Counter(tokens)

# Tabulate the tallies as Token / Count columns, most frequent first,
# and keep the top 20
unique_freq_df = pd.DataFrame.from_dict(unique_freq, orient='index').reset_index()
unique_freq_df = unique_freq_df.rename(columns={'index': 'Token', 0: 'Count'})
unique_freq_df = unique_freq_df.sort_values('Count', ascending=False).head(20)

# Token-indexed version of the table, used for plotting
unique_freq_df1 = unique_freq_df.reset_index(drop=True)
unique_freq_df2 = unique_freq_df1.set_index("Token")
unique_freq_df2

## Plot the Top 20 Unigrams

In [None]:
# Bar chart of the 20 most frequent unigrams
ax = unique_freq_df2.plot(kind="bar", figsize=(18, 5), grid=False, color="pink")
ax.set_title("Word Frequency of Top 20 Unigrams in Incorrect Data", size=15)
ax.set_xlabel("Words")
ax.set_ylabel("Frequency")
plt.show()

## Bigrams Word Frequency Distribution

In [None]:
# Build the bigrams (adjacent word pairs) of each cleaned question
bigram_list = [list(nltk.bigrams(text.split())) for text in df_incorrect['Clean_text']]

# Tally every bigram across all questions
bigram_count = collections.Counter(chain.from_iterable(bigram_list))

# Tabulate the tallies as Bigram / Count columns, most frequent first
bigram_df = (
    pd.DataFrame.from_dict(bigram_count, orient='index')
    .reset_index()
    .rename(columns={'index': 'Bigram', 0: 'Count'})
    .sort_values('Count', ascending=False)
)

# Show the 20 most frequent bigrams
bigram_df.head(20)

In [None]:
# Keep the 20 most frequent bigrams for plotting
bigram20 = bigram_df.head(20)

# Bar chart of the top 20 bigrams; the title size matches the unigram and
# trigram charts.
bigram20.plot(x="Bigram", y="Count", kind="bar", figsize= (18,5), grid=False, color = "pink")
plt.title("Bigram Frequency of Top 20 Bigrams in Incorrect Data", size = 15)
plt.xlabel("Bigrams")
plt.ylabel("Frequency")
plt.show()

## Visualizing trigram frequency distribution

In [None]:

# Build the trigrams (runs of three adjacent words) of each cleaned question
trigram_list = [list(nltk.trigrams(text.split())) for text in df_incorrect['Clean_text']]

# Tally every trigram across all questions
trigram_count = collections.Counter(chain.from_iterable(trigram_list))

# Tabulate the tallies as Trigram / Count columns, most frequent first
trigram_df = (
    pd.DataFrame.from_dict(trigram_count, orient='index')
    .reset_index()
    .rename(columns={'index': 'Trigram', 0: 'Count'})
    .sort_values('Count', ascending=False)
)

# Show the 20 most frequent trigrams
trigram_df.head(20)

In [None]:
# Keep the 20 most frequent trigrams for plotting
trigram20 = trigram_df.head(20)

# Bar chart of the top 20 trigrams
ax = trigram20.plot(x="Trigram", y="Count", kind="bar", figsize=(18, 5), grid=False, color="pink")
ax.set_title("Trigram Frequency of Top 20 Trigrams in Incorrect Data", size=15)
ax.set_xlabel("Trigrams")
ax.set_ylabel("Frequency")
plt.show()

# Approach 3: BERT Topic Modelling

In [None]:
# Install the bertopic package into the notebook runtime (IPython shell command)
!pip install bertopic

In [None]:
# Try to import BERTopic
from bertopic import BERTopic

In [None]:
# Data processing
import pandas as pd
import numpy as np

# Text preprocessing
import nltk
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')
wn = nltk.WordNetLemmatizer()

# Topic model
from bertopic import BERTopic

# Dimension reduction
from umap import UMAP

In [None]:
# Initiate UMAP (dimensionality reduction step used by BERTopic).
# random_state is fixed so repeated runs use the same seed.
umap_model = UMAP(n_neighbors=15,
                  n_components=5,
                  min_dist=0.0,
                  metric='cosine',
                  random_state=100)

# Initiate BERTopic
topic_model = BERTopic(umap_model=umap_model, language="english", calculate_probabilities=True)

# Run BERTopic model. The cleaned texts are passed as a plain list of strings,
# the input type BERTopic documents, instead of a pandas Series that still
# carries the filtered (non-contiguous) index of df_incorrect.
documents = df_incorrect['Clean_text'].tolist()
topics, probabilities = topic_model.fit_transform(documents)

In [None]:
# Show the summary table of the topics found by the fitted model
topic_model.get_topic_info()

In [None]:
# Get the top terms for the topic with id 1
# (presumably 10 terms, BERTopic's default top_n_words; confirm)
topic_model.get_topic(1)

In [None]:
# Visualize the top keywords of up to 8 topics as bar charts
topic_model.visualize_barchart(top_n_topics=8)

In [None]:
# Visualize the intertopic distance map
topic_model.visualize_topics()

In [None]:
# Visualize the similarity between topics as a heatmap
topic_model.visualize_heatmap()