In [5]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from scipy.stats import boxcox
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report
from sklearn.feature_selection import RFE, RFECV 
from sklearn.svm import SVR, SVC

# Location of the data: S3 bucket name and the key of the CSV object inside it
bucket_name, file_key = 'rachaeld-data445', 'Customers.csv'

# Open an S3 resource handle and bind it to the bucket
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)

# Request the object; the response's 'Body' entry is the stream handed to pandas
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# Parse the CSV stream into a DataFrame and preview the first five rows
customers = pd.read_csv(file_content_stream)
customers.head(5)

Unnamed: 0,CustomerID,Gender,Age,Annual Income ($),Spending Score (1-100),Profession,Work Experience,Family Size
0,1,Male,19,15000,39,Healthcare,1,4
1,2,Male,21,35000,81,Engineer,3,3
2,3,Female,20,86000,6,Engineer,1,1
3,4,Female,23,59000,77,Lawyer,0,2
4,5,Female,31,38000,40,Entertainment,2,6


In [6]:
# Remove the CustomerID column from the working frame.
# errors = 'ignore' keeps this cell safe to re-run: once the column is gone,
# a second execution is a no-op instead of raising KeyError.
customers = customers.drop(columns = ['CustomerID'], errors = 'ignore')
customers.head()

Unnamed: 0,Gender,Age,Annual Income ($),Spending Score (1-100),Profession,Work Experience,Family Size
0,Male,19,15000,39,Healthcare,1,4
1,Male,21,35000,81,Engineer,3,3
2,Female,20,86000,6,Engineer,1,1
3,Female,23,59000,77,Lawyer,0,2
4,Female,31,38000,40,Entertainment,2,6


In [10]:

# Fill missing values in every categorical column (Profession among them)
# with that column's most frequent label.

# Columns with object dtype are the ones treated as categorical
categorical_vars = customers.select_dtypes(include = 'object').columns.tolist()
print(categorical_vars)

# .mode() returns one row per tied mode; row 0 gives a single label per column
frequent_values = customers.loc[:, categorical_vars].mode().iloc[0].to_dict()
print(frequent_values)

# fillna with a {column: value} dict only touches the listed columns
customers = customers.fillna(value = frequent_values)


['Gender', 'Profession']
{'Gender': 'Female', 'Profession': 'Artist'}


In [14]:
# Numeric variables: every column whose dtype is not object
numeric_vars = customers.select_dtypes(exclude = 'O').columns.to_list()
print(numeric_vars)

# The median is used as the fill value because the data isn't normal.
# Build a {column: median} dictionary for the numeric columns.
# (The stray bare `median_values` expression that used to follow this line was
# a no-op -- only a cell's last expression is displayed -- and has been removed.)
median_values = customers[numeric_vars].median().to_dict()

# Replace missing numeric data with the per-column median
customers = customers.fillna(value = median_values)

# Sanity check: per-column count of remaining missing values
customers.isna().sum()

['Age', 'Annual Income ($)', 'Spending Score (1-100)', 'Work Experience', 'Family Size']


Gender                    0
Age                       0
Annual Income ($)         0
Spending Score (1-100)    0
Profession                0
Work Experience           0
Family Size               0
dtype: int64

In [12]:
# Input variables: every column except the spending score
X = customers.drop(columns = ['Spending Score (1-100)'])
# Target variable: the spending score
Y = customers['Spending Score (1-100)']

# 80/20 train/test split, stratified on the target.
# random_state is fixed so that Restart & Run All reproduces the same
# partition (and therefore the same downstream results) on every run.
# NOTE(review): stratify = Y uses each distinct score as its own stratum;
# scikit-learn raises ValueError if any score occurs only once -- confirm this
# is the intended stratification for this target.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size = 0.2, stratify = Y, random_state = 42
)
X_train

Unnamed: 0,Gender,Age,Annual Income ($),Profession,Work Experience,Family Size
653,Male,27,152717,Executive,1,6
807,Male,25,57535,Entertainment,1,7
1375,Female,78,132983,Healthcare,7,7
106,Female,66,43000,Homemaker,3,2
413,Male,80,59099,Marketing,1,7
...,...,...,...,...,...,...
917,Male,4,96543,Artist,9,1
670,Male,54,59163,Marketing,3,3
1295,Male,68,87040,Artist,9,1
1384,Female,50,104467,Artist,5,1
