In [1]:
# import necessary packages
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.neighbors import NearestNeighbors
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics  import f1_score,accuracy_score

In [2]:
INPUT_FILE = "RetailCustomerSales.csv"
OUTPUT_FILE = "RetailCustomerSales-Filled.csv"

In [3]:
# Read Csv File And Make dataFrame
dtypes = {
    'CustomerID':         np.int64,
    'ItemID':            np.object,
    'Sex':               np.object,
    'Age':               np.object,
    'Profession':         np.int64,
    'CityType':          np.object,
    'YearsInCity':       np.object,
    'Married':          np.float64,
    'ItemCategory1':      np.int64,
    'ItemCategory2':    np.float64,
    'ItemCategory3':    np.float64,
    'Amount':             np.int64,
}
df = pd.read_csv(INPUT_FILE, dtype=dtypes)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  'ItemID':            np.object,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  'Sex':               np.object,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  'Age':               np.object,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  'CityType':          np.object,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  'YearsInCity':       np.object,


In [4]:
# preprocess the dataframe and normilize data
class MultiColumnLabelEncoder:

    def __init__(self, columns=None):
        self.columns = columns # array of column names to encode


    def fit(self, X, y=None):
        self.encoders = {}
        columns = X.columns if self.columns is None else self.columns
        for col in columns:
            self.encoders[col] = preprocessing.LabelEncoder().fit(X[col])
        return self


    def transform(self, X):
        output = X.copy()
        columns = X.columns if self.columns is None else self.columns
        for col in columns:
            output[col] = self.encoders[col].transform(X[col])
        return output


    def fit_transform(self, X, y=None):
        return self.fit(X,y).transform(X).fillna(value=-1)


    def inverse_transform(self, X):
        output = X.copy()
        columns = X.columns if self.columns is None else self.columns
        for col in columns:
            output[col] = self.encoders[col].inverse_transform(X[col])
        return output.replace(-1, np.NaN)

multi = MultiColumnLabelEncoder(columns=['ItemID','Sex', 'Age', 'CityType', 'YearsInCity'])
df = multi.fit_transform(df)

In [5]:
# seperate dataset and target rows
dataset = df[df['Married'] != -1]
target_rows = df[df['Married'] == -1]


In [6]:
# first we split our data into input and output
# y is the output and is stored in "Married" column of dataframe
# X contains the other columns and are features or input
Y = dataset.Married
dataset.drop(['Married'], axis=1, inplace=True)
X = dataset

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset.drop(['Married'], axis=1, inplace=True)


In [7]:
# Now we split the dataset in train and test part
# here the train set is 75% and test set is 25%
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=2)

In [8]:
# We are trying to classify marriage status so 
# here training DecisionTreeClassifier which is fit to our dataset
DT = DecisionTreeClassifier()
DT.fit(X_train,y_train)

DecisionTreeClassifier()

In [9]:
# test accuracy of model
pred = DT.predict(X_test)
print("accuracy_score: ", accuracy_score(y_test,pred))
print("f1_score: ", f1_score(y_test,pred))

accuracy_score:  0.9995284185787618
f1_score:  0.9994266540091923


In [10]:
# remove Married column from target rows 
target_rows.drop(['Married'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  target_rows.drop(['Married'], axis=1, inplace=True)


In [11]:
pred = DT.predict(target_rows)

In [12]:
# adding prediction results to our dataframe
target_rows.insert(loc=7, column='Married', value=pred)

In [13]:
# reverse transformation so we have the values as input file values
target_rows = multi.inverse_transform(target_rows)

In [14]:
# save the result in csv format to OUTPUT_FILE
target_rows.to_csv(OUTPUT_FILE)

In [15]:
df

Unnamed: 0,CustomerID,ItemID,Sex,Age,Profession,CityType,YearsInCity,Married,ItemCategory1,ItemCategory2,ItemCategory3,Amount
0,1000001,670,0,0,10,0,2,0.0,3,-1.0,-1.0,8370
1,1000001,2374,0,0,10,0,2,0.0,1,6.0,14.0,15200
2,1000001,850,0,0,10,0,2,0.0,12,-1.0,-1.0,1422
3,1000001,826,0,0,10,0,2,0.0,12,14.0,-1.0,1057
4,1000002,2732,1,6,16,2,4,0.0,8,-1.0,-1.0,7969
...,...,...,...,...,...,...,...,...,...,...,...,...
537572,1004737,1829,1,3,16,2,1,0.0,1,2.0,-1.0,11664
537573,1004737,1018,1,3,16,2,1,0.0,1,15.0,16.0,19196
537574,1004737,3316,1,3,16,2,1,0.0,8,15.0,-1.0,8043
537575,1004737,2736,1,3,16,2,1,0.0,5,-1.0,-1.0,7172
