In [None]:
import pandas as pd

# Read the Excel question bank into a pandas DataFrame.
# NOTE(review): '/content/...' is an absolute path; adjust it when the
# notebook is run from a different environment.
df = pd.read_excel('/content/english_data_test2.xlsx')

# Display the first 5 rows of the DataFrame
display(df.head())

# Print the column summary. DataFrame.info() writes its report itself and
# returns None, so it is called bare: wrapping it in display() only adds a
# stray "None" to the cell output.
df.info()

Unnamed: 0,Question,Option A,Option B,Option C,Option D,Correct Answer Letter,Difficulty_Level
0,She ___ to school every day.,go,goes,going,gone,B,4
1,Which is a plural noun?,book,pen,cats,water,C,4
2,"is the opposite of ""big""?",small,tall,large,huge,A,4
3,I ___ an apple.,eat,eats,eating,ate,A,3
4,The sun ___ in the east.,rise,rises,rose,rising,B,4


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1876 entries, 0 to 1875
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Question               1876 non-null   object
 1   Option A               1876 non-null   object
 2   Option B               1876 non-null   object
 3   Option C               1876 non-null   object
 4   Option D               1876 non-null   object
 5   Correct Answer Letter  1872 non-null   object
 6   Difficulty_Level       1876 non-null   int64 
dtypes: int64(1), object(6)
memory usage: 102.7+ KB


None

In [None]:
import re
import string

# Column roles used by the cleaning and modelling cells
difficulty_column = 'Difficulty_Level'
correct_answer_column = 'Correct Answer Letter'
# Free-text columns: the question stem followed by its four answer options
text_columns = [
    'Question',
    'Option A',
    'Option B',
    'Option C',
    'Option D',
]

# Function to clean text
def clean_text(text):
    """Lowercase ``text`` and strip punctuation from it.

    Every character listed in ``string.punctuation`` is removed, and so is
    any character whose Unicode general category is punctuation (``P*``),
    e.g. curly quotes/apostrophes and en/em dashes. Without the Unicode
    check, "don't" typed with a straight apostrophe and with a curly one
    would be cleaned differently.

    Parameters
    ----------
    text : object
        The cell value to clean.

    Returns
    -------
    object
        The cleaned string when ``text`` is a ``str``; otherwise ``text``
        itself, unchanged (so NaN/None/numeric cells pass through).
    """
    # Local import keeps the function usable even if only this cell is run.
    import unicodedata

    if not isinstance(text, str):
        return text

    lowered = text.lower()
    # Characters are deleted, not replaced by spaces, so "___" collapses to
    # nothing and surrounding spaces are left as they were.
    return ''.join(
        ch for ch in lowered
        if ch not in string.punctuation
        and not unicodedata.category(ch).startswith('P')
    )

# Apply cleaning to text columns
for col in text_columns:
    df[col] = df[col].apply(clean_text)

# Handle missing values in relevant columns.
# A row without a question or without an answer key is unusable, so such
# rows are dropped; blank option cells are kept and normalised to ''.
core_columns = [*text_columns, correct_answer_column, difficulty_column]

print("Missing values before handling:")
print(df[core_columns].isnull().sum())

# Drop rows where 'Question' or 'Correct Answer Letter' is missing
df.dropna(subset=['Question', correct_answer_column], inplace=True)

# For option columns, missing values might indicate fewer options, which is fine.
# Fill them with an empty string and assign the result back to the column:
# `df[col].fillna('', inplace=True)` operates on a temporary object, emits a
# FutureWarning, and has no effect on `df` from pandas 3.0 onwards.
for col in text_columns:
    df[col] = df[col].fillna('')


print("\nMissing values after handling:")
print(df[core_columns].isnull().sum())


# Display information about the processed DataFrame.
# info() prints its own report and returns None, so it is not wrapped in
# display() (which would only render a stray "None").
df.info()
display(df.head())

Missing values before handling:
Question                 0
Option A                 0
Option B                 0
Option C                 0
Option D                 0
Correct Answer Letter    4
Difficulty_Level         0
dtype: int64

Missing values after handling:
Question                 0
Option A                 0
Option B                 0
Option C                 0
Option D                 0
Correct Answer Letter    0
Difficulty_Level         0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Index: 1872 entries, 0 to 1875
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Question               1872 non-null   object
 1   Option A               1872 non-null   object
 2   Option B               1872 non-null   object
 3   Option C               1872 non-null   object
 4   Option D               1872 non-null   object
 5   Correct Answer Letter  1872 non-null   object
 6   Difficulty_Level  

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('', inplace=True)


None

Unnamed: 0,Question,Option A,Option B,Option C,Option D,Correct Answer Letter,Difficulty_Level
0,she to school every day,go,goes,going,gone,B,4
1,which is a plural noun,book,pen,cats,water,C,4
2,is the opposite of big,small,tall,large,huge,A,4
3,i an apple,eat,eats,eating,ate,A,3
4,the sun in the east,rise,rises,rose,rising,B,4


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Make sure every text column holds plain strings (avoids a TypeError when
# concatenating). Missing cells are filled *before* the cast: astype(str)
# turns NaN into the literal string 'nan', after which fillna('') can no
# longer detect it.
text_columns = ['Question', 'Option A', 'Option B', 'Option C', 'Option D']
for col in text_columns:
    df[col] = df[col].fillna('').astype(str)

# 1. Combine the text from the 'Question' column and all 'Option' columns
#    into one space-separated document per row.
df['combined_text'] = df[text_columns[0]].str.cat(df[text_columns[1:]], sep=' ')

# 2. Initialize a TF-IDF Vectorizer
# Using a limited number of features to manage dimensionality
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# 3. Fit the TF-IDF Vectorizer to the combined text data and transform the text.
# 4. X holds the resulting TF-IDF feature matrix (one row per question).
X = tfidf_vectorizer.fit_transform(df['combined_text'])

print("Shape of the TF-IDF feature matrix (X):", X.shape)

Shape of the TF-IDF feature matrix (X): (1872, 1649)


In [None]:
from sklearn.model_selection import train_test_split

# Target: the difficulty label attached to each question
y = df['Difficulty_Level']

# Hold out 20% of the rows for testing; the seed is fixed at 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Verify the split by printing the shape of every partition
for caption, part in (
    ("Shape of X_train:", X_train),
    ("Shape of X_test:", X_test),
    ("Shape of y_train:", y_train),
    ("Shape of y_test:", y_test),
):
    print(caption, part.shape)

Shape of X_train: (1497, 1649)
Shape of X_test: (375, 1649)
Shape of y_train: (1497,)
Shape of y_test: (375,)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Create an instance of the Random Forest Classifier model.
# All hyperparameters are left at their defaults; only the seed
# (random_state=42) is set explicitly.
model = RandomForestClassifier(random_state=42)

In [None]:
# Train the model using the training data:
# X_train holds the TF-IDF features, y_train the matching difficulty labels.
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predict the difficulty level of every held-out question
y_pred = model.predict(X_test)

# Compute all evaluation artefacts up front ...
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# ... then report them: accuracy, confusion matrix, per-class report
print(f"Accuracy: {accuracy:.4f}")
for heading, body in (
    ("\nConfusion Matrix:", conf_matrix),
    ("\nClassification Report:", class_report),
):
    print(heading)
    print(body)

Accuracy: 0.8613

Confusion Matrix:
[[18  0  0  2  0  0  0  0  0  0  0]
 [ 0 23  2  2  6  0  0  0  0  0  0]
 [ 0  0 32  2  0  0  0  2  0  0  0]
 [ 0  0  0 63  4  2  0  0  0  0  0]
 [ 0  0  0  2 67  0  0  0  0  0  0]
 [ 0  0  0  0  0 37  2  2  0  0  0]
 [ 0  0  0  2  0  4 25  0  0  0  0]
 [ 0  0  0  0  0  1  0 22  0  0  2]
 [ 0  0  0  0  0  2  0  0  9  0  0]
 [ 0  0  0  0  1  0  0  1  0  7  4]
 [ 0  0  0  0  4  0  0  3  0  0 20]]

Classification Report:
              precision    recall  f1-score   support

           1       1.00      0.90      0.95        20
           2       1.00      0.70      0.82        33
           3       0.94      0.89      0.91        36
           4       0.86      0.91      0.89        69
           5       0.82      0.97      0.89        69
           6       0.80      0.90      0.85        41
           7       0.93      0.81      0.86        31
           8       0.73      0.88      0.80        25
           9       1.00      0.82      0.90        11
  

In [None]:
# 1. Create new example questions
# The dict is column-oriented: each key maps to a list of 10 strings, and
# position i across the five lists forms one multiple-choice question
# (stem + options A-D). No answer key or difficulty label is included.
new_questions_data = {
  'Question': [
        'Choose the correct form of the verb: She ___ to the market every Sunday.',
        'Which sentence is in the passive voice?',
        'What is the synonym of "benevolent"?',
        'What is the opposite of "scarce"?',
        'Which of the following is a metaphor?',
        'Identify the alliteration:',
        'Read the sentence: "Although it was raining, they continued playing football." What does the word "Although" show?',
        'Which sentence is grammatically correct?',
        'Choose the correct pronoun: This is the man ___ car was stolen.',
        'Which word is an antonym of "expand"?'
    ],
    'Option A': [
        'go',
        'The teacher explained the lesson.',
        'Cruel',
        'Rare',
        'Her smile is as bright as the sun.',
        'Peter Piper picked a peck of pickled peppers.',
        'Cause',
        'He don’t like coffee.',
        'who',
        'enlarge'
    ],
    'Option B': [
        'goes',
        'The lesson was explained by the teacher.',
        'Kind',
        'Abundant',
        'Her smile is the sun.',
        'The stars are shining brightly tonight.',
        'Contrast',
        'She doesn’t likes tea.',
        'whose',
        'contract'
    ],
    'Option C': [
        'going',
        'The students listened carefully.',
        'Angry',
        'Limited',
        'She runs like the wind.',
        'She is as busy as a bee.',
        'Result',
        'They doesn’t play football.',
        'whom',
        'reduce'
    ],
    'Option D': [
        'gone',
        'The teacher is explaining the lesson.',
        'Selfish',
        'Few',
        'He was as tall as a tree.',
        'The book was very interesting.',
        'Time',
        'She doesn’t like tea.',
        'which',
        'minimize'
    ]
}
# One row per question, one column per dict key
new_questions_df = pd.DataFrame(new_questions_data)

# Apply the same text cleaning as used on the training data.
# Missing cells are filled before the cast to str (astype(str) would turn
# NaN into the literal 'nan'), and the result is assigned back to the column.
# Calling `new_questions_df[col].fillna('', inplace=True)` instead acts on a
# temporary object: it emits a FutureWarning and stops working in pandas 3.0.
text_columns = ['Question', 'Option A', 'Option B', 'Option C', 'Option D']
for col in text_columns:
    new_questions_df[col] = (
        new_questions_df[col].fillna('').astype(str).apply(clean_text)
    )

# 2. Combine the text from the 'Question' column and all 'Option' columns
#    into one space-separated document per new question.
new_questions_df['combined_text'] = new_questions_df[text_columns[0]].str.cat(
    new_questions_df[text_columns[1:]], sep=' '
)


# Transform the combined text of new questions using the *fitted* TF-IDF Vectorizer
# (transform only: the vectorizer is not refitted on the new questions).
X_new = tfidf_vectorizer.transform(new_questions_df['combined_text'])

# 3. Choose the model used for prediction.
# NOTE(review): the original note compared this model (accuracy 0.8613) with a
# tuned `best_model` (accuracy 0.8533). Neither `best_model` nor `accuracy_tuned`
# is defined in the cells visible here, so `model` is the only candidate on a
# fresh run. Confirm whether a tuning cell was removed.
final_model = model # switch to best_model only if a tuning cell defines it and it scores higher

# Predict the difficulty level for the new questions
new_predictions = final_model.predict(X_new)

# 4. Display the new questions and their predicted difficulty levels.
# Note that the 'Question' column shown here holds the cleaned (lowercased,
# punctuation-stripped) text, not the original wording.
new_questions_df['Predicted Difficulty Level'] = new_predictions

print("New Questions and Predicted Difficulty Levels:")
display(new_questions_df[['Question', 'Predicted Difficulty Level']])

New Questions and Predicted Difficulty Levels:


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  new_questions_df[col].fillna('', inplace=True) # Fill any potential NaNs after cleaning
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  new_questions_df[col].fillna('', inplace=True) # Fill any potential NaNs after cleaning
The behavior will change in pandas 3.0. This inplace met

Unnamed: 0,Question,Predicted Difficulty Level
0,choose the correct form of the verb she to th...,6
1,which sentence is in the passive voice,6
2,what is the synonym of benevolent,5
3,what is the opposite of scarce,5
4,which of the following is a metaphor,4
5,identify the alliteration,11
6,read the sentence although it was raining they...,4
7,which sentence is grammatically correct,4
8,choose the correct pronoun this is the man ca...,9
9,which word is an antonym of expand,8


In [None]:
import joblib

# Write the trained classifier to "model.pkl" (relative to the current
# working directory).
joblib.dump(model, "model.pkl")


# Write the fitted TF-IDF vectorizer to "vectorizer.pkl". The notebook passes
# new text through this same vectorizer before calling the model's predict().
joblib.dump(tfidf_vectorizer, "vectorizer.pkl")


['vectorizer.pkl']