In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, recall_score, precision_score,roc_auc_score, f1_score
from sklearn.naive_bayes import GaussianNB


# Task 1

In [2]:
data = pd.read_csv("C:/Users/ashi/Downloads/bank-additional-full.csv")
data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

In [4]:
# Convert target variable to binary format
data['y'] = data['y'].map({'yes': 1, 'no': 0})

# Verify the changes
print(data['y'].value_counts())

y
0    36548
1     4640
Name: count, dtype: int64


One Hot Encoding

In [5]:
# Identify categorical columns manually (ensure they exist as text)
categorical_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 
                    'contact', 'month', 'day_of_week', 'poutcome']

# Convert categorical columns to string format to ensure proper encoding
data[categorical_cols] = data[categorical_cols].astype(str)

# Apply One-Hot Encoding correctly
data_onehot = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

# Verify changes
print("New shape after One-Hot Encoding:", data_onehot.shape)
print("Newly created One-Hot Encoded columns:", set(data_onehot.columns) - set(data.columns))

New shape after One-Hot Encoding: (41188, 54)
Newly created One-Hot Encoded columns: {'job_student', 'job_unknown', 'contact_telephone', 'month_sep', 'loan_unknown', 'job_entrepreneur', 'month_mar', 'education_unknown', 'default_yes', 'job_blue-collar', 'month_jul', 'loan_yes', 'marital_single', 'month_nov', 'month_oct', 'day_of_week_tue', 'job_retired', 'housing_unknown', 'education_university.degree', 'job_services', 'education_basic.9y', 'job_unemployed', 'education_professional.course', 'day_of_week_mon', 'housing_yes', 'month_aug', 'day_of_week_thu', 'marital_unknown', 'default_unknown', 'marital_married', 'month_may', 'education_illiterate', 'education_basic.6y', 'job_technician', 'day_of_week_wed', 'job_self-employed', 'month_jun', 'job_housemaid', 'poutcome_nonexistent', 'education_high.school', 'job_management', 'poutcome_success', 'month_dec'}


Applying PCA since we have too many columns

In [6]:
# Define X (features) and y (target)
X = data_onehot.drop(columns=['y'])  # Remove target variable
y = data_onehot['y']  # Target variable

# Apply PCA - Set a fixed number of components instead of variance-based reduction
pca = PCA(n_components=21)  # Keep 21 dimensions instead of reducing too aggressively

# Fit PCA and transform X
X_pca = pca.fit_transform(X)

# Check new shape after PCA
print("New shape after PCA:", X_pca.shape)


New shape after PCA: (41188, 21)


In [7]:
# Check explained variance ratio
explained_variance = np.cumsum(pca.explained_variance_ratio_)  # Cumulative variance
print("Variance retained with 21 components:", explained_variance[11])  # Index 11 = 12th component

Variance retained with 21 components: 0.9999761148700061


In [8]:
# Define features (X) and target variable (y)
X = data.drop(columns=['y'])  # Features
y = data['y']  # Target variable

# Perform train-test split (with stratification)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Check class distribution in train & test sets
print("Train set class distribution:\n", y_train.value_counts(normalize=True))
print("Test set class distribution:\n", y_test.value_counts(normalize=True))

Train set class distribution:
 y
0    0.887344
1    0.112656
Name: proportion, dtype: float64
Test set class distribution:
 y
0    0.887351
1    0.112649
Name: proportion, dtype: float64


Dataset is highly imbalanced, so we do SMOTE oversampling to handle it

In [9]:
# Split data into training & testing sets (before applying SMOTE)
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, stratify=y, random_state=42)

# Apply SMOTE on training set only 
smote = SMOTE(sampling_strategy=0.5, random_state=42)  # Increase minority class to 50% of majority class
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Check new class distribution
print("Original train set class distribution:\n", y_train.value_counts(normalize=True))
print("Resampled train set class distribution:\n", y_train_resampled.value_counts(normalize=True))

Original train set class distribution:
 y
0    0.887344
1    0.112656
Name: proportion, dtype: float64
Resampled train set class distribution:
 y
0    0.666667
1    0.333333
Name: proportion, dtype: float64


#### Decision Tree Classifier

In [10]:
dt_model = DecisionTreeClassifier(max_depth=10, random_state=42)  # Limit depth to prevent overfitting
dt_model.fit(X_train_resampled, y_train_resampled)  

# Predictions on the original test set
y_pred_dt = dt_model.predict(X_test)
y_proba_dt = dt_model.predict_proba(X_test)[:, 1]  # Probability scores for ROC-AUC

# Evaluate Model Performance
print("Decision Tree - Classification Report:\n", classification_report(y_test, y_pred_dt))
print("Decision Tree - ROC-AUC Score:", roc_auc_score(y_test, y_proba_dt))
print("Decision Tree - F1-Score (0):", f1_score(y_test, y_pred_dt, pos_label=0))  # F1-score for class 0


Decision Tree - Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.90      0.93      7310
           1       0.48      0.77      0.59       928

    accuracy                           0.88      8238
   macro avg       0.73      0.83      0.76      8238
weighted avg       0.91      0.88      0.89      8238

Decision Tree - ROC-AUC Score: 0.876953659370725
Decision Tree - F1-Score (0): 0.9303779482807616


#### Naive Bayes Classifier

Now we use standardization for Naïve Bayes because it assumes features follow a normal distribution, and standardization ensures equal importance by making all features have mean = 0 and standard deviation = 1.

In [11]:
# Initialize StandardScaler
scaler = StandardScaler()

# Scale only the training set (to avoid data leakage)
X_train_resampled_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)  # Apply the same transformation to the test set

In [12]:
# Train Naïve Bayes on the scaled resampled training set
nb_model = GaussianNB()
nb_model.fit(X_train_resampled_scaled, y_train_resampled)

# Predictions
y_pred_nb = nb_model.predict(X_test_scaled)
y_proba_nb = nb_model.predict_proba(X_test_scaled)[:, 1]  # Probability scores for ROC-AUC

# Evaluate Model
print("Naïve Bayes - Classification Report:\n", classification_report(y_test, y_pred_nb))
print("Naïve Bayes - ROC-AUC Score:", roc_auc_score(y_test, y_proba_nb))
print("Naïve Bayes - F1-Score (0):", f1_score(y_test, y_pred_nb, pos_label=0))  # F1-score for class 0


Naïve Bayes - Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.91      0.93      7310
           1       0.49      0.65      0.56       928

    accuracy                           0.89      8238
   macro avg       0.72      0.78      0.75      8238
weighted avg       0.90      0.89      0.89      8238

Naïve Bayes - ROC-AUC Score: 0.8990604657059297
Naïve Bayes - F1-Score (0): 0.9338825665014313


Here, we want to focus more on people likely to say no, so we can focus more of our resources on trying to get them to agree. So, it is more important to reduce False Negatives, i.e., the number of people who actually say no but we predict that we said yes. So, we focus on maximising recall.

But, we also want to keep some balance in precision and recall, so we don't bother people who are likely to say yes already by repeatedly calling. 

Thus, the recall for 0 is more important here. Since we get a good score, we are satisfied with this model.

In [13]:
# Function to compute selected metrics
def compute_metrics(y_true, y_pred, y_proba):
    return {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Recall (0)": recall_score(y_true, y_pred, pos_label=0),  # Only Recall for class 0
        "F1-score (0)": f1_score(y_true, y_pred, pos_label=0),
        "ROC-AUC": roc_auc_score(y_true, y_proba),
    }

# Compute metrics for Decision Tree
dt_metrics = compute_metrics(y_test, y_pred_dt, y_proba_dt)

# Compute metrics for Naïve Bayes
nb_metrics = compute_metrics(y_test, y_pred_nb, y_proba_nb)

# Create a comparison table
comparison_table = pd.DataFrame({
    "Metric": dt_metrics.keys(),
    "Decision Tree": dt_metrics.values(),
    "Naïve Bayes": nb_metrics.values()
})

# Print the comparison table
print(comparison_table.to_string(index=False))


      Metric  Decision Tree  Naïve Bayes
    Accuracy       0.881039     0.885045
  Recall (0)       0.895759     0.914911
F1-score (0)       0.930378     0.933883
     ROC-AUC       0.876954     0.899060


Naive Bayes produces better results comparitively.

# Task 2

In [14]:
# Load the dataset
df = pd.read_csv("C:/Users/ashi/Downloads/Data for repository.csv")

df.head(5)  


Unnamed: 0,Movie Name,Release Period,Whether Remake,Whether Franchise,Genre,New Actor,New Director,New Music Director,Lead Star,Director,Music Director,Number of Screens,Revenue(INR),Budget(INR)
0,Golden Boys,Normal,No,No,suspense,Yes,No,No,Jeet Goswami,Ravi Varma,Baba Jagirdar,5,5000000,85000
1,Kaccha Limboo,Holiday,No,No,drama,Yes,No,Yes,Karan Bhanushali,Sagar Ballary,Amardeep Nijjer,75,15000000,825000
2,Not A Love Story,Holiday,No,No,thriller,No,No,No,Mahie Gill,Ram Gopal Verma,Sandeep Chowta,525,75000000,56700000
3,Qaidi Band,Holiday,No,No,drama,Yes,No,No,Aadar Jain,Habib Faisal,Amit Trivedi,800,210000000,4500000
4,Chaatwali,Holiday,No,No,adult,Yes,Yes,Yes,Aadil Khan,Aadil Khan,Babloo Ustad,1,1000000,1075000


In [15]:
# Create a new column for hit or flop
df['hit_or_flop'] = df.apply(lambda row: 'hit' if row['Revenue(INR)'] > row['Budget(INR)'] else 'flop', axis=1)

df.head(5)

Unnamed: 0,Movie Name,Release Period,Whether Remake,Whether Franchise,Genre,New Actor,New Director,New Music Director,Lead Star,Director,Music Director,Number of Screens,Revenue(INR),Budget(INR),hit_or_flop
0,Golden Boys,Normal,No,No,suspense,Yes,No,No,Jeet Goswami,Ravi Varma,Baba Jagirdar,5,5000000,85000,hit
1,Kaccha Limboo,Holiday,No,No,drama,Yes,No,Yes,Karan Bhanushali,Sagar Ballary,Amardeep Nijjer,75,15000000,825000,hit
2,Not A Love Story,Holiday,No,No,thriller,No,No,No,Mahie Gill,Ram Gopal Verma,Sandeep Chowta,525,75000000,56700000,hit
3,Qaidi Band,Holiday,No,No,drama,Yes,No,No,Aadar Jain,Habib Faisal,Amit Trivedi,800,210000000,4500000,hit
4,Chaatwali,Holiday,No,No,adult,Yes,Yes,Yes,Aadil Khan,Aadil Khan,Babloo Ustad,1,1000000,1075000,flop


In [16]:
# Count unique values in the Genre column
num_unique_genres = df["Genre"].nunique()
print(f"Number of unique genres: {num_unique_genres}")

# Display all unique genres
unique_genres = df["Genre"].unique()
print("Unique genres:", unique_genres)

Number of unique genres: 14
Unique genres: ['suspense' 'drama' 'thriller' 'adult' 'comedy' 'action' 'love_story'
 'rom__com' 'horror' 'fantasy' 'masala' 'mythological' 'animation'
 'documentary']


In [17]:
# Count unique values in the Lead Star column
num_unique_lead_star = df["Lead Star"].nunique()
print(f"Number of unique lead stars: {num_unique_lead_star}")

Number of unique lead stars: 764


In [18]:
# Count unique values in the Director column
num_unique_directors = df["Director"].nunique()
print(f"Number of unique directors: {num_unique_directors}")

Number of unique directors: 1048


In [19]:
# Count unique values in the Music Director column
num_unique_music_directors = df["Music Director"].nunique()
print(f"Number of unique music directors: {num_unique_music_directors}")

Number of unique music directors: 630


In [20]:
df.drop(columns=["Movie Name", "Lead Star", "Director", "Music Director","Genre"], inplace=True)
df.head()

Unnamed: 0,Release Period,Whether Remake,Whether Franchise,New Actor,New Director,New Music Director,Number of Screens,Revenue(INR),Budget(INR),hit_or_flop
0,Normal,No,No,Yes,No,No,5,5000000,85000,hit
1,Holiday,No,No,Yes,No,Yes,75,15000000,825000,hit
2,Holiday,No,No,No,No,No,525,75000000,56700000,hit
3,Holiday,No,No,Yes,No,No,800,210000000,4500000,hit
4,Holiday,No,No,Yes,Yes,Yes,1,1000000,1075000,flop


In [21]:
# Initialize the label encoder
label_encoder = LabelEncoder()
binary_cols = ['Release Period','Whether Remake','Whether Franchise','New Actor','New Director','New Music Director','hit_or_flop']

# Apply label encoding to binary columns
for col in binary_cols:
    df[col] = label_encoder.fit_transform(df[col])

df[binary_cols].head()  # Display the first few rows of encoded columns


Unnamed: 0,Release Period,Whether Remake,Whether Franchise,New Actor,New Director,New Music Director,hit_or_flop
0,1,0,0,1,0,0,1
1,0,0,0,1,0,1,1
2,0,0,0,0,0,0,1
3,0,0,0,1,0,0,1
4,0,0,0,1,1,1,0


In [22]:
# Define features (X) and target variable (y)
X = df.drop(columns=["hit_or_flop"])  # Exclude target variable
y = df["hit_or_flop"]

# Split data into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [23]:
# Count occurrences of each class
print(y_train.value_counts())

hit_or_flop
1    981
0    377
Name: count, dtype: int64


#### Decision Tree Classifier

In [24]:
# Initialize and train the Decision Tree
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

In [25]:
# Predict on test data
y_pred_dt = dt_model.predict(X_test)

# Print detailed classification report
print("\nClassification Report:\n", classification_report(y_test, y_pred_dt))
print("Decision Tree - ROC-AUC Score:", roc_auc_score(y_test, y_pred_dt))
print("Decision Tree - F1-Score:", f1_score(y_test, y_pred_dt))


Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.89      0.92        95
           1       0.96      0.98      0.97       245

    accuracy                           0.96       340
   macro avg       0.95      0.94      0.94       340
weighted avg       0.96      0.96      0.96       340

Decision Tree - ROC-AUC Score: 0.9371643394199785
Decision Tree - F1-Score: 0.9696969696969697


#### Naive Bayes Classifier

In [26]:
# Initialize Gaussian Naïve Bayes
nb_model = GaussianNB()

# Train the model
nb_model.fit(X_train, y_train)

# Make predictions
y_pred_nb = nb_model.predict(X_test)

# Print classification report
print("\nClassification Report:\n", classification_report(y_test, y_pred_nb))
print("Naive Bayes - ROC-AUC Score:", roc_auc_score(y_test, y_pred_nb))
print("Naive Bayes - F1-Score:", f1_score(y_test, y_pred_nb))



Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.57      0.68        95
           1       0.85      0.96      0.90       245

    accuracy                           0.85       340
   macro avg       0.84      0.76      0.79       340
weighted avg       0.85      0.85      0.84       340

Naive Bayes - ROC-AUC Score: 0.7617615467239527
Naive Bayes - F1-Score: 0.9


In [29]:
# Compute metrics for Decision Tree
accuracy_dt = accuracy_score(y_test, y_pred_dt)
precision_dt = precision_score(y_test, y_pred_dt)
recall_dt = recall_score(y_test, y_pred_dt)
f1_dt = f1_score(y_test, y_pred_dt)

# Compute metrics for Naïve Bayes
accuracy_nb = accuracy_score(y_test, y_pred_nb)
precision_nb = precision_score(y_test, y_pred_nb)
recall_nb = recall_score(y_test, y_pred_nb)
f1_nb = f1_score(y_test, y_pred_nb)

comparison_table = pd.DataFrame({
    "Metric": ["Accuracy", "Precision", "Recall", "F1-score"],
    "Decision Tree": [accuracy_dt, precision_dt, recall_dt, f1_dt],
    "Naïve Bayes": [accuracy_nb, precision_nb, recall_nb, f1_nb]
})

print(comparison_table.to_string(index=False))

   Metric  Decision Tree  Naïve Bayes
 Accuracy       0.955882     0.847059
Precision       0.960000     0.850909
   Recall       0.979592     0.955102
 F1-score       0.969697     0.900000


The Decision Tree classifier consistently out performs Naive Bayes. This is possibly due to the overall smaller dataset size.