In [1]:
import sys
assert sys.version_info >= (3, 5)

import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os
import tarfile
import urllib
import pandas as pd
import urllib.request

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")
 
import category_encoders as ce 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import KFold
import tensorflow as tf
from sklearn.linear_model import Perceptron


In [2]:
def load_data():
    csv_path ="./train-set.csv"
    return pd.read_csv(csv_path)

In [3]:
pd.options.display.float_format = '{:,.0f}'.format

df = load_data() # "housing" is a pandas DataFrame object containing all the data
df.head() # Display the top five rows of the DataFrame
#Q1 How many attribuites in the data set? Desribe the attribuites .

Unnamed: 0,CustomerID,Gender,Married,Age,Graduated,Profession,WorkExperience,SpendingScore,FamilySize,Category,Segmentation
0,462809,Male,No,22,No,Healthcare,1.0,Low,4,Category 4,D
1,462643,Female,Yes,38,Yes,Engineer,,Average,3,Category 4,A
2,466315,Female,Yes,67,Yes,Engineer,1.0,Low,1,Category 6,B
3,461735,Male,Yes,67,Yes,Lawyer,0.0,High,2,Category 6,B
4,462669,Female,Yes,40,Yes,Entertainment,,High,6,Category 6,A


In [6]:
df['SpendingScore'].value_counts()

Low        4878
Average    1974
High       1216
Name: SpendingScore, dtype: int64

In [7]:
df['Category'].value_counts()

Category 6    5238
Category 4    1089
Category 3     822
Category 2     422
Category 7     203
Category 1     133
Category 5      85
Name: Category, dtype: int64

In [8]:
df['Segmentation'].value_counts()

D    2268
A    1972
C    1970
B    1858
Name: Segmentation, dtype: int64

In [9]:
dictionary_SpendingScore=[{'col':'SpendingScore','mapping':{'Low':1,'Average':2,'High':3}}]
encoder=ce.OrdinalEncoder(cols='SpendingScore',mapping=dictionary_SpendingScore)

x_train_encoded =encoder.fit_transform(df)

dictionary_Category=[{'col':'Category','mapping':{'Category 1':1,'Category 2':2,'Category 3':3,'Category 4':4,'Category 5':5,'Category 6':6,'Category 7':7}}]
encoder=ce.OrdinalEncoder(cols='Category',mapping=dictionary_Category)

x_train_encoded =encoder.fit_transform(x_train_encoded)

dictionary_Segmentation=[{'col':'Segmentation','mapping':{'A':1,'B':2,'C':3,'D':4}}]
encoder=ce.OrdinalEncoder(cols='Category',mapping=dictionary_Segmentation)

x_train_encoded =encoder.fit_transform(x_train_encoded)

#coding the Category to number
x_train_encoded.Gender = pd.Categorical(x_train_encoded.Gender)
x_train_encoded['Gender'] = x_train_encoded.Gender.cat.codes

x_train_encoded.Married = pd.Categorical(x_train_encoded.Married)
x_train_encoded['Married'] = x_train_encoded.Married.cat.codes

x_train_encoded.Graduated = pd.Categorical(x_train_encoded.Graduated)
x_train_encoded['Graduated'] = x_train_encoded.Graduated.cat.codes

x_train_encoded.Profession = pd.Categorical(x_train_encoded.Profession)
x_train_encoded['Profession'] = x_train_encoded.Profession.cat.codes



x_train_encoded.head() 

Unnamed: 0,CustomerID,Gender,Married,Age,Graduated,Profession,WorkExperience,SpendingScore,FamilySize,Category,Segmentation
0,462809,1,0,22,0,5,1.0,1,4,4,4
1,462643,0,1,38,1,2,,2,3,4,1
2,466315,0,1,67,1,2,1.0,1,1,6,2
3,461735,1,1,67,1,7,0.0,3,2,6,2
4,462669,0,1,40,1,3,,3,6,6,1


In [10]:

x_train_encoded["WorkExperience"] = pd.to_numeric(x_train_encoded["WorkExperience"])


nan_value = float("NaN")
#x_train_encoded.replace(0, nan_value, inplace=True)
x_train_encoded.dropna(subset = ["FamilySize"], inplace=True)
x_train_encoded.dropna(subset = ["WorkExperience"], inplace=True)
 
x_train_encoded["WorkExperience"] = x_train_encoded["WorkExperience"].astype(int)

X=np.array(x_train_encoded[['Gender','Married','Age','Graduated','Profession','WorkExperience','SpendingScore','FamilySize','Category']])

y=np.array(x_train_encoded['Segmentation'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)
print(X_train)
print(X_train.shape)
print(y)
print(y.shape)

[[ 1.  1. 39. ...  2.  2.  1.]
 [ 0.  0. 27. ...  1.  4.  6.]
 [ 0.  1. 41. ...  2.  2.  6.]
 ...
 [ 0.  0. 38. ...  1.  8.  2.]
 [ 1.  1. 40. ...  1.  2.  6.]
 [ 0.  0. 55. ...  1.  1.  6.]]
(4878, 9)
[4 2 2 ... 4 2 2]
(6969,)


In [11]:
per_clf = Perceptron(max_iter=1000, tol=1e-3, random_state=42)
per_clf.fit(X_train, y_train)

Perceptron(random_state=42)

In [12]:
y_pred = per_clf.predict(X_test)

In [14]:
yTruePred=np.empty(len(y_pred), dtype=int)
for i in range(len(y_pred)):
    if y_pred[i]==y_test[i]: 
      yTruePred[i] = y_pred[i]

In [14]:
print(len(y_test))
print(len(yTruePred))

2091
2091
