In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier


file_path = 'C:\\Users\\qaism\\OneDrive - University of Virginia\\Documents\\GitHub\\assignment3\\data\\car_data.csv'
df = pd.read_csv(file_path)

In [12]:
print("First few rows of the dataset:")
print(df.head())

print("Shape of the dataset:")
print(df.shape)

First few rows of the dataset:
   User ID Gender  Age  AnnualSalary  Purchased
0      385   Male   35         20000          0
1      681   Male   40         43500          0
2      353   Male   49         74000          0
3      895   Male   40        107500          1
4      661   Male   25         79000          0
Shape of the dataset:
(1000, 5)


In [3]:
# Summarize the variables
summary_stats = df.describe()

# Check for missing values
missing_values = df.isnull().sum()

# Convert Gender into a dummy variable
df['Gender'] = df['Gender'].apply(lambda x: 0 if x == 'Male' else 1)

# Create a matrix X of predictors and an outcome y
X = df[['Age', 'AnnualSalary']]
y = df['Purchased']

print("Summary Statistics:")
print(summary_stats)

print("Missing Values:")
print(missing_values)

Summary Statistics:
           User ID          Age   AnnualSalary    Purchased
count  1000.000000  1000.000000    1000.000000  1000.000000
mean    500.500000    40.106000   72689.000000     0.402000
std     288.819436    10.707073   34488.341867     0.490547
min       1.000000    18.000000   15000.000000     0.000000
25%     250.750000    32.000000   46375.000000     0.000000
50%     500.500000    40.000000   72000.000000     0.000000
75%     750.250000    48.000000   90000.000000     1.000000
max    1000.000000    63.000000  152500.000000     1.000000
Missing Values:
User ID         0
Gender          0
Age             0
AnnualSalary    0
Purchased       0
dtype: int64


In [4]:
# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Fit and transform the data in 'X'
X_normalized = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.2, random_state=42)
# Show the number of samples in the training and testing sets
print("Number of samples in training set:", len(X_train))
print("Number of samples in testing set:", len(X_test))

Number of samples in training set: 800
Number of samples in testing set: 200


In [25]:
# MaxMin-normalize Age and AnnualSalary
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)# Determine the optimal number of neighbors using GridSearchCV
params = {'n_neighbors': range(1, 20)}
knn = KNeighborsClassifier()
grid_search = GridSearchCV(knn, params, cv=5)
grid_search.fit(X_train_scaled, y_train)

# Get the optimal number of neighbors
optimal_neighbors = grid_search.best_params_['n_neighbors']

# Initialize the KNeighborsClassifier with the optimal number of neighbors
knn_optimal = KNeighborsClassifier(n_neighbors=optimal_neighbors)

# Fit the model to the scaled training data
knn_optimal.fit(X_train_scaled, y_train)

# Make predictions on the scaled test set
y_pred = knn_optimal.predict(X_test_scaled)

In [26]:

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the KNeighborsClassifier with the optimal number of neighbors
knn_optimal = KNeighborsClassifier(n_neighbors=5)

# Fit the model to the training data
knn_optimal.fit(X_train, y_train)

# Make predictions on the test set
y_pred = knn_optimal.predict(X_test)

# Generate the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)
print(f"Accuracy: {accuracy:.2f}")

Confusion Matrix:
[[102  10]
 [ 30  58]]
Accuracy: 0.80


Confusion Matrix
True Negative (TN): 102 - The model correctly predicted that a sale would not occur 102 times.
False Positive (FP): 10 - The model incorrectly predicted a sale 10 times when it did not actually occur.
False Negative (FN): 30 - The model incorrectly predicted no sale 30 times when a sale actually did occur.
True Positive (TP): 58 - The model correctly predicted that a sale would occur 58 times.

Accuracy
The accuracy of your model is 0.80 or 80, which is pretty good for a classification problem.

Interpretation
The model predicts a sale when one fails to occur in 10 cases (FP).
The model predicts no sale when one does occur in 30 cases (FN).
Overall, the model provides accurate predictions 80 of the time.

In [29]:
# Convert Gender into a dummy variable
test_df['Gender'] = test_df['Gender'].apply(lambda x: 0 if x == 'Male' else 1)

# Separate men and women in the test dataset
test_df_men = test_df[test_df['Gender'] == 0]
test_df_women = test_df[test_df['Gender'] == 1]

# Now check the shape again
print("Shape of test_df_men:", test_df_men.shape)
print("Shape of test_df_women:", test_df_women.shape)
# Separate X and y for men and women
X_test_men = test_df_men[['Age', 'AnnualSalary']]
y_test_men = test_df_men['Purchased']

X_test_women = test_df_women[['Age', 'AnnualSalary']]
y_test_women = test_df_women['Purchased']

# Make predictions for men and women separately using the fitted knn_optimal model
y_pred_men = knn_optimal.predict(X_test_men)
y_pred_women = knn_optimal.predict(X_test_women)

# Compute confusion matrices and accuracy scores
conf_matrix_men = confusion_matrix(y_test_men, y_pred_men)
conf_matrix_women = confusion_matrix(y_test_women, y_pred_women)

accuracy_men = accuracy_score(y_test_men, y_pred_men)
accuracy_women = accuracy_score(y_test_women, y_pred_women)

print("Confusion Matrix for Men:", conf_matrix_men)
print("Accuracy for Men:", accuracy_men)
print("Confusion Matrix for Women:", conf_matrix_women)
print("Accuracy for Women:", accuracy_women)

Shape of test_df_men: (98, 4)
Shape of test_df_women: (102, 4)
Confusion Matrix for Men: [[57  3]
 [16 22]]
Accuracy for Men: 0.8061224489795918
Confusion Matrix for Women: [[45  7]
 [14 36]]
Accuracy for Women: 0.7941176470588235


The model's accuracy for predicting purchases is approximately 80.6% for men and 79.4% for women. The differences in accuracy between the two groups are relatively small, suggesting that the model performs similarly for both men and women.