In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [None]:

def load_data(path='iphone_purchase_data.csv'):
    """Read the purchase dataset from a CSV file.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to ``'iphone_purchase_data.csv'``
        in the current working directory, which is what the notebook
        originally hard-coded.

    Returns
    -------
    pandas.DataFrame
        The file contents exactly as parsed by ``pd.read_csv``.
    """
    return pd.read_csv(path)


In [None]:
def preprocess_data(df):
    """Encode the categorical columns of the purchase data as numbers.

    ``Gender``, ``Location`` and ``Has Apple Products`` are label-encoded;
    ``Employment Status`` is mapped to an explicit ordinal scale
    (0 = Unemployed ... 3 = Employed Full-Time).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing the four columns named above.

    Returns
    -------
    pandas.DataFrame
        An encoded copy of ``df``. The input frame is left unmodified, so
        calling this function again on the same raw frame gives the same
        result.
    """
    # Work on a copy: encoding the caller's frame in place makes a second
    # call operate on already-encoded values (the status mapping below would
    # then produce nothing but NaN).
    encoded = df.copy()

    # A fresh encoder per column keeps the columns independent of each other.
    for column in ('Gender', 'Location', 'Has Apple Products'):
        encoded[column] = LabelEncoder().fit_transform(encoded[column])

    # Encode the "Employment Status" column with custom values
    employment_status_mapping = {
        'Employed Full-Time': 3,
        'Employed Part-Time': 2,
        'Student': 1,
        'Unemployed': 0
    }

    # Statuses that are not keys of the mapping come out as NaN.
    encoded['Employment Status'] = encoded['Employment Status'].map(employment_status_mapping)
    print(encoded)
    return encoded

In [None]:
def drop_features(df, threshold):
    """Remove features that are only weakly correlated with the target.

    The correlation of every numeric/boolean column with
    ``'Will Purchase iPhone'`` is computed and printed. Columns whose
    absolute correlation is strictly below ``threshold`` are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing a numeric ``'Will Purchase iPhone'`` column.
    threshold : float
        Minimum absolute correlation a feature needs in order to be kept.

    Returns
    -------
    pandas.DataFrame
        A new frame without the weak features. ``df`` itself is not
        modified. The target column is always kept, and so are columns
        whose correlation is undefined (NaN) or that are not numeric.
    """
    target = 'Will Purchase iPhone'

    # Restrict to numeric/boolean columns explicitly so the behaviour does
    # not depend on the pandas version's handling of text columns in corr().
    numeric = df.select_dtypes(include=['number', 'bool'])
    co = numeric.corr()[target].sort_values(ascending=False)
    print("Correlation between each feature and Will purchase iphone")
    print(co)

    # Never drop the target, whatever the threshold: the caller still needs
    # it to build y. A NaN correlation compares False and is therefore kept.
    cols_to_drop = [
        name for name, value in co.items()
        if name != target and abs(value) < threshold
    ]
    return df.drop(columns=cols_to_drop)

In [None]:
def feature_scaling(X_train, X_test):
    """Scale both splits with a ``StandardScaler`` fitted on the training split.

    The scaler is fitted on ``X_train`` only and then applied to both
    ``X_train`` and ``X_test``.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled)``.
    """
    scaler = StandardScaler().fit(X_train)
    return scaler.transform(X_train), scaler.transform(X_test)

In [None]:
def fit_classifier(X_train, y_train, classifier):
    """Create (when given a name) and fit a classifier on the training data.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed straight to ``fit``.
    classifier : str or estimator
        Either one of the names ``"SVM"``, ``"Logistic Regression"`` or
        ``"KNN"``, or an already constructed object exposing ``fit``, which
        is then fitted as-is.

    Returns
    -------
    The fitted estimator.

    Raises
    ------
    ValueError
        If ``classifier`` is a string that is not one of the known names.
    """
    if isinstance(classifier, str):
        # Factories are lazy so only the requested model is constructed.
        factories = {
            "SVM": lambda: SVC(kernel="linear", random_state=0),
            "Logistic Regression": lambda: LogisticRegression(random_state=0),
            "KNN": lambda: KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2),
        }
        if classifier not in factories:
            # Without this check an unknown name would reach `.fit` as a plain
            # string and fail with an unhelpful AttributeError.
            raise ValueError(
                "Unknown classifier {!r}; expected one of: {}".format(
                    classifier, ", ".join(factories)
                )
            )
        model = factories[classifier]()
    else:
        model = classifier

    model.fit(X_train, y_train)
    return model

In [None]:
def make_prediction(X_test, classifier):
    """Return whatever ``classifier.predict`` yields for ``X_test``."""
    return classifier.predict(X_test)

In [None]:
def confusion_matrix(y_test, y_pred):
    """Print the confusion matrix and the accuracy, precision and recall scores.

    All values are computed with ``sklearn.metrics`` from the true labels
    ``y_test`` and the predicted labels ``y_pred``. Nothing is returned.
    """
    print(metrics.confusion_matrix(y_test, y_pred))

    # Each score is computed and printed in turn, in this fixed order.
    scorers = (
        ("Accuracy score:", metrics.accuracy_score),
        ("Precision score:", metrics.precision_score),
        ("Recall score:", metrics.recall_score),
    )
    for label, scorer in scorers:
        print(label, scorer(y_test, y_pred))

In [None]:
def main():
    """Run the whole pipeline: load, encode, select features, train, evaluate.

    Three classifiers (KNN, Logistic Regression, SVM) are trained on an
    oversampled, scaled training split. For each one the confusion matrix
    and scores on the held-out test split are printed.
    """
    # Load data
    df = load_data()

    # Encode categorical features
    df = preprocess_data(df)

    # Drop features with low correlation
    df = drop_features(df, 0.15)

    # Separate the features from the target column
    X = df.drop('Will Purchase iPhone', axis=1)
    y = df['Will Purchase iPhone']

    # Split data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Oversample the minority class using the training split ONLY.
    # Resampling the full (X, y) would copy test rows into the training set
    # and make every score below look better than it really is.
    oversample = RandomOverSampler(random_state=42)
    X_train, y_train = oversample.fit_resample(X_train, y_train)

    # Feature scaling
    X_train, X_test = feature_scaling(X_train, X_test)

    # Fit every model first, then report them in the same order.
    classifier_names = ("KNN", "Logistic Regression", "SVM")
    fitted = [
        (name, fit_classifier(X_train, y_train, name))
        for name in classifier_names
    ]

    for name, model in fitted:
        print(name)
        y_pred = make_prediction(X_test, model)
        confusion_matrix(y_test, y_pred)



In [None]:
# Entry point: run the pipeline when this code is executed directly
# (as a script or a notebook cell), but not when it is imported as a module.
if __name__ == "__main__":
    main()

     Age  Gender  Salary  Employment Status  Location  Has Apple Products  \
0     34       1   89000                  1         2                   1   
1     59       2   88000                  1         2                   0   
2     31       2   95000                  3         0                   1   
3     58       2  201000                  3         0                   1   
4     46       1  166000                  1         1                   0   
..   ...     ...     ...                ...       ...                 ...   
995   41       0   74000                  1         1                   0   
996   31       1  153000                  1         1                   0   
997   47       1  247000                  3         2                   0   
998   20       0  239000                  0         2                   0   
999   63       2  267000                  0         2                   0   

     Will Purchase iPhone  
0                       0  
1                  