In [198]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Global plot styling: white background with grid lines on every seaborn/matplotlib figure.
# `set_theme` is the preferred spelling; `sns.set` is only an alias for it.
sns.set_theme(style="whitegrid")


In [199]:
# Load the raw dataset into `df`; later cells copy from it rather than modifying it.
# NOTE(review): absolute, user-specific path — the notebook only runs on this machine.
# Consider a path relative to a configurable data directory.
DATA_PATH = "/Users/sanskarranjan/Downloads/Dataset.csv"
df = pd.read_csv(DATA_PATH)

# DataFrame.info() writes its report itself and returns None, so it must not be
# wrapped in print() (that appends a stray "None" line to the output).
df.info()
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 649 entries, 0 to 648
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   school      649 non-null    object 
 1   sex         649 non-null    object 
 2   address     649 non-null    object 
 3   famsize     599 non-null    object 
 4   Pstatus     649 non-null    object 
 5   Medu        649 non-null    int64  
 6   Fedu        576 non-null    float64
 7   Mjob        649 non-null    object 
 8   Fjob        649 non-null    object 
 9   reason      649 non-null    object 
 10  guardian    649 non-null    object 
 11  traveltime  576 non-null    float64
 12  failures    649 non-null    int64  
 13  schoolsup   649 non-null    object 
 14  famsup      649 non-null    object 
 15  paid        649 non-null    object 
 16  activities  649 non-null    object 
 17  nursery     649 non-null    object 
 18  higher      573 non-null    object 
 19  internet    649 non-null    o

In [None]:
# Side-by-side histograms (with KDE overlay) for three columns of the raw frame.
hist_features = ['Feature_1', 'Feature_2', 'Feature_3']
fig, axes = plt.subplots(1, len(hist_features), figsize=(18, 5))

for ax, feature in zip(axes, hist_features):
    # One panel per column, 20 bins each.
    sns.histplot(df[feature], bins=20, kde=True, ax=ax)
    ax.set_title(f'Dist of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Freq')

fig.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the numeric columns of the raw frame
# (non-numeric columns are excluded by numeric_only=True).
corr_matrix = df.corr(numeric_only=True)

heatmap_options = dict(
    annot=True,                # write the coefficient inside each cell
    fmt=".2f",                 # two decimals per annotation
    cmap="seismic",            # diverging palette
    square=True,
    linewidths=0.3,
    cbar_kws={'shrink': .8},   # slightly smaller colour bar
)

fig, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(corr_matrix, ax=ax, **heatmap_options)
ax.set_title("Heatmap of Features")
plt.show()

In [None]:
# Count nulls per column of the raw frame and keep only the columns that have any.
missing = df.isna().sum()
missing_feats = missing.loc[missing.gt(0)]
print("Features with missing values:", missing_feats)

In [None]:
# Build the cleaned frame from a copy so the raw `df` is never modified and this
# cell can be re-run safely.
df_cleaned = df.copy()

# Columns are renamed purely by position.
# NOTE(review): assumes the CSV always has exactly these 33 columns in this order — confirm.
df_cleaned.columns = [
    "school", "sex", "address", "famsize", "Pstatus", "Medu", "Fedu", "Mjob",
    "Fjob", "reason", "guardian", "traveltime", "failures", "schoolsup",
    "famsup", "paid", "activities", "nursery", "higher", "internet", "famrel",
    "freetime", "goout", "Dalc", "health", "absences", "G1", "G2", "G3", "age",
    "studytime", "extrovertednes", "romantic",
]

# Imputation is written as `df[col] = df[col].fillna(...)` instead of
# `df[col].fillna(..., inplace=True)`: the in-place form operates on a temporary
# selection, triggers a FutureWarning on pandas 2.x and silently does nothing
# once Copy-on-Write is active.

# Categorical columns: fill gaps with the most frequent value.
for feature in ['famsize', 'higher']:
    df_cleaned[feature] = df_cleaned[feature].fillna(df_cleaned[feature].mode()[0])

# Numeric columns: fill gaps with the column median.
for feature in ['Fedu', 'traveltime', 'freetime', 'absences']:
    df_cleaned[feature] = df_cleaned[feature].fillna(df_cleaned[feature].median())

# G2: prefer the mean of G1 and G3 where both are present. If either is missing
# the mean is NaN, fillna leaves the gap, and the median fallback below covers it.
g1_g3_mean = (df_cleaned['G1'] + df_cleaned['G3']) / 2
df_cleaned['G2'] = df_cleaned['G2'].fillna(g1_g3_mean)
df_cleaned['G2'] = df_cleaned['G2'].fillna(df_cleaned['G2'].median())

for feature in ['age', 'studytime', 'extrovertednes']:
    df_cleaned[feature] = df_cleaned[feature].fillna(df_cleaned[feature].median())

# The target is never imputed: rows without a `romantic` label are dropped.
df_cleaned = df_cleaned.dropna(subset=['romantic'])


In [None]:
# Re-check the cleaned frame: list any column that still contains nulls.
missing_ = df_cleaned.isna().sum()
missing_feat = missing_.loc[missing_.gt(0)]

if missing_feat.empty:
    print("No features have missing values.")
else:
    print("Features with missing values:", missing_feat)

Level 3

Level 4

Logistic regression

In [None]:

# Encode on a separate copy so `df_cleaned` keeps its original labels. Mutating
# it in place made this cell non-idempotent: on a second run the already-encoded
# 0/1 target was mapped through {'yes', 'no'} again and became all-NaN.
df_encoded = df_cleaned.copy()
df_encoded['romantic'] = df_encoded['romantic'].map({'yes': 1, 'no': 0})

# Any label other than 'yes'/'no' maps to NaN; fail here with a clear message
# instead of deep inside the estimator.
if df_encoded['romantic'].isna().any():
    raise ValueError("'romantic' contains values other than 'yes'/'no'")

# Integer-encode every remaining text column; the fitted encoders are kept so
# the codes can be translated back to labels later.
# NOTE(review): integer codes impose an arbitrary order on nominal columns,
# which a linear model treats as numeric — consider one-hot encoding.
categcols = df_encoded.select_dtypes(include='object').columns
label_encoders = {}
for col in categcols:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])
    label_encoders[col] = le

# Features and target
X = df_encoded.drop(columns=['romantic'])
y = df_encoded['romantic']

# Train-test split (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features: the scaler is fitted on the training split only and then
# applied to the test split, so no test statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the logistic-regression model. It has its own name so it is not
# overwritten by the random forest trained in the following "Random forest" cell
# (the duplicate random-forest block that used to sit at the end of this cell
# was removed).
log_reg = LogisticRegression()
log_reg.fit(X_train_scaled, y_train)

# Predictions and evaluation. The report and the matrix are multi-line, so they
# start on their own line; otherwise the label shifts the first row.
y_pred = log_reg.predict(X_test_scaled)
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred), sep="")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred), sep="")

# Feature importance: rank by coefficient magnitude so strong negative
# influences are listed too; the printed values keep their sign.
importance = pd.Series(log_reg.coef_[0], index=X.columns)
print("Top features influencing romantic relationships:")
print(importance.sort_values(key=lambda s: s.abs(), ascending=False).head(10))



Random forest

In [None]:
# Train a random forest on the same standardized split used for the logistic
# regression. The seed is fixed so the forest is reproducible.
# `model` is read by the SHAP cell further down.
model = RandomForestClassifier(random_state=42)
model.fit(X_train_scaled, y_train)

# Predictions and evaluation. The report and the matrix are multi-line, so they
# start on their own line; otherwise the label shifts the first row.
y_pred = model.predict(X_test_scaled)
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred), sep="")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred), sep="")


Level 5


In [None]:
# Convert scaled X_train and X_test back to DataFrames so SHAP plots show
# column names instead of positional feature indices.
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X.columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X.columns)

# Initialize SHAP explainer for the tree-based model held in `model`.
explainer = shap.TreeExplainer(model)

# Calculate SHAP values for the test set.
shap_values = explainer.shap_values(X_test_scaled_df)

# Select the values for class 1 (romantic = 1). The return layout depends on the
# installed shap version: older releases give a list with one
# (samples, features) array per class, newer ones a single
# (samples, features, classes) array. Indexing the latter with [1] would pick
# the second *sample*, not the second class.
if isinstance(shap_values, list):
    shap_values_class1 = shap_values[1]
else:
    shap_values_arr = np.asarray(shap_values)
    if shap_values_arr.ndim == 3:
        shap_values_class1 = shap_values_arr[:, :, 1]
    else:
        shap_values_class1 = shap_values_arr

# SHAP summary plots for class 1: the bar plot first, then the default summary plot.
shap.summary_plot(shap_values_class1, X_test_scaled_df, plot_type="bar")
shap.summary_plot(shap_values_class1, X_test_scaled_df)