## Objective

A comparative study of KNN and Decision Tree classifiers for predicting students' adaptability level in online education.

### Import libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
# Suppress every warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation and convergence warnings from the libraries used below.
warnings.filterwarnings("ignore")

### Read Data

In [None]:
# Load the survey CSV into a DataFrame; the path is relative to the working directory.
df = pd.read_csv("Quaid Khalid - students_adaptability_level_online_education.csv")
# NOTE: a notebook cell only displays its last expression, so this preview is
# evaluated but not shown; only the column index below is displayed.
df.head()
df.columns

In [None]:
# Index.str.strip() returns a new Index instead of modifying df, so the result
# must be assigned back for the whitespace-free names to take effect.
df.columns = df.columns.str.strip()
# Show the cleaned column names as the cell output.
df.columns

In [None]:
# Print each column's name, non-null count and dtype.
df.info()

### Generate Data Profiling Report

In [None]:
from ydata_profiling import ProfileReport
# Build a ydata-profiling report for df and embed it in the notebook output.
profile = ProfileReport(df)
profile.to_notebook_iframe()

In [None]:
# Count rows that are exact copies of an earlier row.
df.duplicated().sum()

In [None]:
# Add a running row counter so that otherwise identical rows become distinct.
# The length is taken from the frame itself: a hard-coded 1205 raises ValueError
# as soon as the CSV holds a different number of rows.
df["new_index"] = range(len(df))
df.head(2)

In [None]:
# Re-count duplicates: every row now carries a distinct `new_index` value,
# so no two rows can match in all columns.
df.duplicated().sum()

In [None]:
# Re-inspect the columns and dtypes after adding `new_index`.
df.info()

# Encoding

Choose a suitable encoding technique for the columns.

In [None]:
# Preview the categorical columns that are one-hot encoded in the next cell.
columns = df.loc[
    :,
    [
        "Gender",
        "Institution Type",
        "IT Student",
        "Location",
        "Load-shedding",
        "Internet Type",
        "Self Lms",
        "Network Type",
        "Device",
    ],
]
columns

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns below. get_dummies replaces each listed
# column with one 0/1 integer column per category, named "<column>_<category>".
nominal_columns = [
    "Gender",
    "Institution Type",
    "IT Student",
    "Location",
    "Load-shedding",
    "Internet Type",
    "Self Lms",
    "Network Type",
    "Device",
]
df = pd.get_dummies(df, columns=nominal_columns, dtype=int)
df.head(2)

In [None]:
# Frequency of each Age category.
df["Age"].value_counts()

In [None]:
# Frequency of each Education Level category.
df["Education Level"].value_counts()

In [None]:
# Frequency of each Financial Condition category.
df["Financial Condition"].value_counts()

In [None]:

# Frequency of each Class Duration category.
df["Class Duration"].value_counts()

In [None]:
# Ordinal encoding: each mapping numbers its categories 1, 2, 3, ... in the
# order they are listed (lowest to highest).
Age_encoding = {
    label: rank
    for rank, label in enumerate(
        ['1-5', '6-10', '11-15', '16-20', '21-25', '26-30'], start=1
    )
}
Education_Level_encoding = {
    label: rank
    for rank, label in enumerate(['School', 'College', 'University'], start=1)
}
Financial_Condition_encoding = {
    label: rank for rank, label in enumerate(['Poor', 'Mid', 'Rich'], start=1)
}
Class_Duration_encoding = {
    label: rank for rank, label in enumerate(['0', '1-3', '3-6'], start=1)
}

# Source column -> mapping. Each result is stored as "<source column>_encoding";
# a value missing from its mapping becomes NaN (Series.map behaviour).
ordinal_maps = {
    'Age': Age_encoding,
    'Education Level': Education_Level_encoding,
    'Financial Condition': Financial_Condition_encoding,
    'Class Duration': Class_Duration_encoding,
}
for source_column, codes in ordinal_maps.items():
    df[f'{source_column}_encoding'] = df[source_column].map(codes)
df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the target labels as integers: learn the set of classes first, then
# store the integer codes in a new column next to the original text column.
label_encoder = LabelEncoder()
label_encoder.fit(df['Adaptivity Level'])
df['encoded_Adaptivity Level'] = label_encoder.transform(df['Adaptivity Level'])
df.head()

In [None]:
# Remove the original text columns now that their encoded versions exist.
# inplace=True mutates df directly instead of returning a new frame.
df.drop(columns=['Age','Education Level','Financial Condition','Class Duration','Adaptivity Level'],inplace=True)
df.head()

In [None]:
# Class balance of the encoded target.
df["encoded_Adaptivity Level"].value_counts()

### Check distribution

In [None]:

# Draw one histogram per column of df in a grid three plots wide.
#
# The raw categorical columns (Gender, Age, Education Level, ...) no longer
# exist at this point: the Encoding section replaced them with one-hot and
# ordinal columns, so indexing df with the raw names raises KeyError. Plot the
# columns df actually holds instead, leaving out the `new_index` row counter,
# which has no meaningful distribution.
columns = [column for column in df.columns if column != 'new_index']

num_columns = len(columns)
# Ceiling division gives enough rows for a 3-wide grid; keep at least one row
# so the figure height is never zero.
num_rows = max(1, (num_columns + 2) // 3)

plt.figure(figsize=(15, num_rows * 4))

for i, column in enumerate(columns):
    plt.subplot(num_rows, 3, i + 1)  # subplot positions are 1-based
    sns.histplot(df[column], kde=False)
    plt.title(f'Distribution of {column}')
    plt.xticks(rotation=45)

plt.tight_layout()
plt.show()


# Machine Learning Algorithms

Train a KNN classifier and a Decision Tree classifier, then compare which one performs better.

### Define X and Y

In [None]:
# `new_index` is only a row counter added to make rows unique. It says nothing
# about a student, and as a feature it would let the models key on row position.
# Remove it from df before building the feature table (errors='ignore' keeps
# the cell re-runnable once the column is gone).
df = df.drop(columns=['new_index'], errors='ignore')

# Features: every remaining column except the encoded target.
X = df.drop('encoded_Adaptivity Level', axis=1)
# Target: the integer-encoded adaptivity level.
y = df['encoded_Adaptivity Level']

### Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
# Normalize the features: MinMaxScaler rescales each column to [0, 1] by default.

min_max_scaler = MinMaxScaler()

In [None]:
# Learn each feature's minimum and maximum from the whole feature table.
# NOTE(review): this runs before the train/test split, so test rows influence
# the scaling; consider fitting on the training split only.
min_max_scaler.fit(X)

In [None]:
# Apply the learned scaling to every row of X.
scaled_features = min_max_scaler.transform(X)

In [None]:
# Display the scaled feature values.
scaled_features

In [None]:
# Wrap the scaled values in a DataFrame for inspection. The column labels come
# from X, the table the scaler was applied to, rather than from df.columns[:-1],
# which is only correct while the target happens to be df's last column.
df_feat = pd.DataFrame(scaled_features, columns=X.columns)
df_feat.head()

### Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing.
# random_state fixes the shuffle so KNN and the decision trees are compared on
# the same, reproducible split (42 matches the seed used for the trees below).
# stratify=y keeps each adaptivity class's share equal in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_features, y, test_size=0.30, random_state=42, stratify=y
)

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# k = 1: each prediction takes the label of the single closest training sample.
knn = KNeighborsClassifier(n_neighbors=1)

In [None]:
# Fit the classifier on the training split.
knn.fit(X_train,y_train)

In [None]:
# Predict the adaptivity class for every row of the held-out test split.
pred = knn.predict(X_test)

### Predictions and Evaluations
Let's evaluate our KNN model!

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
# Confusion matrix on the test split: rows are true classes, columns are predicted classes.
print(confusion_matrix(y_test,pred))

In [None]:
# Per-class precision, recall, F1-score and support on the test split.
print(classification_report(y_test,pred))

In [None]:
# Training accuracy.
# NOTE(review): with n_neighbors=1 every training point is its own nearest
# neighbour, so this score is expected to be close to 1.0 and says little
# about generalisation; compare models on the test accuracy instead.
print(knn.score(X_train, y_train))

In [None]:
# Testing accuracy: share of held-out rows whose class is predicted correctly.
print(knn.score(X_test, y_test))

# Decision Tree

**Using Entropy**

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
# Train a decision tree that chooses splits by information gain (entropy).
# random_state=42 makes tie-breaking between equally good splits repeatable.
dt_entropy = DecisionTreeClassifier(criterion='entropy', random_state=42)
dt_entropy.fit(X_train, y_train)

In [None]:
# Predict on the test split with the entropy tree and measure its accuracy.
y_pred_entropy = dt_entropy.predict(X_test)

accuracy_entropy = accuracy_score(y_test, y_pred_entropy)

accuracy_entropy

**Using Gini Index**

In [None]:
# Train a decision tree that chooses splits by Gini impurity, using the same
# seed as the entropy tree so only the split criterion differs.
dt_gini = DecisionTreeClassifier(criterion='gini', random_state=42)
dt_gini.fit(X_train, y_train)

In [None]:
# Predict on the test split with the Gini tree and measure its accuracy.
y_pred_gini = dt_gini.predict(X_test)

accuracy_gini = accuracy_score(y_test, y_pred_gini)

accuracy_gini

In [None]:
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import pandas as pd

# Rank the features by how much a decision tree relies on them.
#
# The `new_index` row counter is left out: it is bookkeeping rather than a
# student attribute, and a tree can split on it freely, which would push a
# meaningless column towards the top of the chart. errors='ignore' keeps this
# working when the column is already absent from X.
importance_features = X.drop(columns=['new_index'], errors='ignore')

# Train on all rows of the (unscaled) feature table; this model is used only
# for the ranking, not for the accuracy comparison above.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(importance_features, y)

# One importance value per column, in the same order as the columns.
importances = dt_model.feature_importances_

# Pair each feature name with its importance and sort from most to least important.
feature_importances = pd.DataFrame(
    {'feature': importance_features.columns, 'importance': importances}
)
feature_importances = feature_importances.sort_values(by='importance', ascending=False)

# Bar chart of the ranking; labels are rotated because the names are long.
plt.figure(figsize=(12, 6))
plt.bar(feature_importances['feature'], feature_importances['importance'])
plt.xlabel('Features')
plt.ylabel('Importance')
plt.xticks(rotation=90)
plt.title('Feature Importances')
plt.show()

# All the Best!