In [107]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cs770-assignment-2a/sample_submission.csv
/kaggle/input/cs770-assignment-2a/train_data.csv
/kaggle/input/cs770-assignment-2a/test_data.csv


In [71]:
# Name: Shriya Reddy Ponnala

In [108]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.metrics import accuracy_score,f1_score, roc_auc_score,classification_report,confusion_matrix

# Loading the dataset
df = pd.read_csv('/kaggle/input/cs770-assignment-2a/train_data.csv')

# Display the first five rows of the dataset
df.head()

Unnamed: 0,Index,Gender,Age,Salary,Purchase Iphone
0,1,0,28,87000,0
1,2,0,57,33000,1
2,3,1,35,73000,0
3,4,0,48,29000,1
4,5,1,32,100000,1


In [109]:
#check if we have any  missing values or not 
#we don't have any missing values in this case
missing_values = df.isnull().sum()
print(missing_values)

Index              0
Gender             0
Age                0
Salary             0
Purchase Iphone    0
dtype: int64


In [110]:
#Splitting the dataset into features and targets 
X = df[['Gender', 'Age', 'Salary']]
y = df['Purchase Iphone']

#Seperating the dataset into a training(80%) set and a validation(20%) set 
#random_state to demonstrate that the split is repeatable.
X_train_set, X_val_set, y_train_set, y_val_set = train_test_split(X, y, test_size=0.2, random_state=42)

#Numerical mapping of gender data
df['Gender'] = df['Gender'].map({'Female': 0, 'Male': 1})

#Creating a Decision Tree Model with Hyperparameter Tuning
#Hyperparameters can be found in gd_param.
dtm_classifier = DecisionTreeClassifier()
gd_param = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [5,6,7,8,9],
    'min_samples_split': [3,4,5,8],
    'min_samples_leaf': [2,3,4,5]
}

#To identify the best hyperparameters, use GridSearchCV.
grid_search = GridSearchCV(dtm_classifier, gd_param, cv=5)
grid_search.fit(X_train_set, y_train_set)

#Display the best hyperparameters
best_params = grid_search.best_params_
print("Best hyperparameters are:",best_params)


Best hyperparameters are: {'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 3}


In [111]:
#Create the DecisionTreeClassifier using the best hyperparameters and Utilize the training data to train the Decision Tree model.
best_dtm_classifier = DecisionTreeClassifier(**best_params)
best_dtm_classifier.fit(X_train_set, y_train_set)

In [112]:
#This demonstrates how well my model is doing and reporting its performance -> accuracy.
y_prediction = best_dtm_classifier.predict(X_val_set)
print("Accuracy:", accuracy_score(y_val_set, y_prediction))
print("F1 Score:", f1_score(y_val_set, y_prediction))
print("AUC-ROC:", roc_auc_score(y_val_set, y_prediction))
print('Classification Report:\n', classification_report(y_val_set, y_prediction))
print('Confusion Matrix:\n', confusion_matrix(y_val_set, y_prediction))


Accuracy: 0.8833333333333333
F1 Score: 0.8205128205128205
AUC-ROC: 0.8504784688995216
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.97      0.91        38
           1       0.94      0.73      0.82        22

    accuracy                           0.88        60
   macro avg       0.90      0.85      0.87        60
weighted avg       0.89      0.88      0.88        60

Confusion Matrix:
 [[37  1]
 [ 6 16]]


In [113]:
# Loading the test dataset from test_data file
test_data = pd.read_csv('/kaggle/input/cs770-assignment-2a/test_data.csv')

##check if we have any  missing values in test data file
missing_values = test_data.isnull().sum()
print(missing_values)


Index     0
Gender    0
Age       0
Salary    0
dtype: int64


In [114]:
#Creating test features by selecting certain columns from the test dataset ('Gender,' 'Age,' and 'Salary').
test_features = test_data[['Gender', 'Age', 'Salary']]
#Making predictions on the test features using the best decision tree classifier
test_predict = best_dtm_classifier.predict(test_features)
# Creating a DataFrame for test predictions with 'Index' and 'Purchase Iphone' columns
test_df_predict = pd.DataFrame({'Index': test_data['Index'], 'Purchase Iphone': test_predict})
# Save the modified sample submission to a CSV file
test_df_predict.to_csv('/kaggle/working/submission.csv', index=False)

In [115]:
#Explanation:--

#1.We divided the training dataset in an 80:20 ratio into a training set and a validation set. This enabled us to train our model on one set of data and then test it on another to evaluate its performance.

# We built a Decision Tree classifier and used GridSearchCV to find the model's best hyperparameters. This stage entailed creating hyperparameter grids and searching for the best combination.

#We used metrics like accuracy, F1-score, AUC-ROCand a confusion matrix to assess the model's performance on the validation set. This stage revealed information on how well the model was performing.

# We utilized the trained model to generate predictions on the test dataset, which we saved to the 'submission' CSV file.

#The accuracy of my model is 0.88 (88%), which means that it properly predicted the iPhone purchase choice for 88% of the test data samples.