In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Location of the raw sales CSV inside the Kaggle input directory.
SALES_CSV_PATH = '/kaggle/input/sales-dataset/sales.csv'

# Load the whole file into the working frame and preview the first rows.
df = pd.read_csv(SALES_CSV_PATH)
df.head()

Data Cleaning

In [None]:
# Replace missing values in these categorical columns with an explicit
# 'Unknown' category.
# The filled columns are assigned back to the frame: calling
# fillna(inplace=True) on a column selection (df[col]) acts on an intermediate
# object, which pandas warns about and which does not update `df` when
# Copy-on-Write is enabled.
unknown_fill_cols = ['education', 'marriage', 'house_owner']
df[unknown_fill_cols] = df[unknown_fill_cols].fillna('Unknown')

In [None]:
# Translate the dataset's abbreviated codes into readable labels.
# Codes that only have a meaning inside one column are grouped per column.
column_code_maps = {
    'gender': {'M': 'Male', 'F': 'Female'},
    'age': {
        '1_Unk': 'Unknown',
        '2_<=25': '<=25',
        '3_<=35': '26-35',
        '4_<=45': '36-45',
        '5_<=55': '46-55',
        '6_<=65': '56-65',
        '7_>65': '>65',
    },
    'mortgage': {'1Low': 'Low', '2Med': 'Medium', '3High': 'High'},
}

df = (
    df
    .replace(column_code_maps)
    # The remaining single-letter flags are expanded in every column.
    .replace({'Y': 'Yes', 'N': 'No'})
    .replace('U', 'Unknown')
)

**Exploratory descriptive analysis**

In [None]:
# Number of customers per education category, most frequent first.
education = df['education'].value_counts()
m_height = education.values.tolist()         # bar heights (customer counts)
education_labels = education.index.tolist()  # category names, same order as m_height

#=====PLOT Preparations and Plotting====#
# One x position per category actually present in the data, so bars, ticks and
# labels always line up regardless of how many categories there are.
ind = np.arange(len(education_labels))  # the x locations for the groups
width = 0.7        # the width of the bars
palette = ['#DEB887','#778899','#DC143C','#FFFF99','#f8f8ff','#FFFF88']
# Repeat the palette if there are more categories than colours.
colors = [palette[i % len(palette)] for i in range(len(education_labels))]
fig, ax = plt.subplots(figsize=(10,7))
education_bars = ax.bar(ind, m_height, width, color=colors)

ax.set_xlabel("education",fontsize=20)
ax.set_ylabel('amount',fontsize=20)
ax.set_title('customer data based on education level',fontsize=22)
ax.set_xticks(ind) #Positioning on the x axis
# Tick labels are taken from the counted categories themselves. A hand-typed
# label list only matches the bars while the frequency ranking of the
# categories happens to equal the typed order.
ax.set_xticklabels([str(label) for label in education_labels],
                   fontsize = 12)

#Auto-labels each bar with the number of customers it represents.
def autolabel(rects,fontsize=14):
    """
    Attach a text label above each bar displaying its height.

    Parameters
    ----------
    rects : iterable of bar patches
        The bars to annotate, e.g. the container returned by ``ax.bar``.
        Each element must provide ``get_x()``, ``get_width()``,
        ``get_height()`` and an ``axes`` attribute.
    fontsize : int, optional
        Font size of the labels (default 14).

    The height is truncated to an integer for display and the label is
    horizontally centred on the bar, anchored at the bar's top edge.
    """
    for rect in rects:
        height = rect.get_height()
        # Draw on the Axes that owns the bar rather than on a global `ax`,
        # so the label always lands on the same chart as the bar it describes.
        rect.axes.text(rect.get_x() + rect.get_width()/2., height, '%d' % int(height),
                       ha='center', va='bottom', fontsize=fontsize)
# Annotate every education bar with its customer count, then render the chart.
autolabel(education_bars, fontsize=14)
plt.show()

In [None]:
# Split every education category into customers who bought the product
# (flag == 'Yes') and customers who did not.
# The counts are computed with two vectorised value_counts calls and aligned to
# `education_labels`; this also avoids a loop variable named `education`, which
# would overwrite the `education` counts Series.
bought_mask = df['flag'] == 'Yes'
education_totals = df['education'].value_counts().reindex(education_labels, fill_value=0)
education_buyers = (df.loc[bought_mask, 'education']
                    .value_counts()
                    .reindex(education_labels, fill_value=0))
did_cc = education_buyers.tolist()                          #did buy
didnt_cc = (education_totals - education_buyers).tolist()   #didn't buy

#=====PLOT Preparations and Plotting====#
# x positions are derived here from the labels being plotted instead of
# relying on an `ind` left over from another cell.
ind = np.arange(len(education_labels))
width = 0.40
fig, ax = plt.subplots(figsize=(12,7))
did_buy = ax.bar(ind, did_cc, width, color='#ADFF2F')
# The second series is shifted right by one bar width so the pairs sit side by side.
didnt_buy = ax.bar(ind + width, didnt_cc, width, color='#DA70D6')

#Add some text for labels, title and axes ticks
ax.set_xlabel("Education",fontsize=20)
ax.set_ylabel('amount of customers',fontsize=20)
ax.set_title('bought or didnt bought product Based on education',fontsize=22)
ax.set_xticks(ind + width / 2) #Centre each tick between the two bars of a pair
# Tick labels come from the same list that ordered the counts, so they cannot
# drift out of sync with the bars.
ax.set_xticklabels([str(label) for label in education_labels],
                   fontsize = 12)
ax.legend((did_buy,didnt_buy),('buy','didnt'),fontsize=17)
autolabel(did_buy, 10)
autolabel(didnt_buy, 10)
plt.show()
print(did_cc)
print(didnt_cc)

In [None]:
# Number of customers per occupation, most frequent first.
occupation = df['occupation'].value_counts()
m_height = occupation.values.tolist()          # bar heights (customer counts)
occupation_labels = occupation.index.tolist()  # category names, same order as m_height

#=====PLOT Preparations and Plotting====#
# One x position per occupation actually present in the data, so bars, ticks
# and labels always line up regardless of how many categories there are.
ind = np.arange(len(occupation_labels))  # the x locations for the groups
width = 0.7        # the width of the bars
palette = ['#DEB887','#778899','#DC143C','#FFFF99','#f8f8ff','#F0DC82']
# Repeat the palette if there are more categories than colours.
colors = [palette[i % len(palette)] for i in range(len(occupation_labels))]
fig, ax = plt.subplots(figsize=(10,7))
occupation_bars = ax.bar(ind, m_height, width, color=colors)

ax.set_xlabel("occupation",fontsize=20)
ax.set_ylabel('amount',fontsize=20)
ax.set_title('customer data based on their occupation',fontsize=22)
ax.set_xticks(ind) #Positioning on the x axis
# Tick labels are taken from the counted categories themselves. A hand-typed
# label list only matches the bars while the frequency ranking of the
# categories happens to equal the typed order.
ax.set_xticklabels([str(label) for label in occupation_labels],
                   fontsize = 12)

# Annotate each occupation bar with its customer count.
# `autolabel` is defined once, in the education bar-chart cell earlier in this
# notebook; an identical second definition here would only shadow that one.
autolabel(occupation_bars)
plt.show() #Display bars.

In [None]:
# Split every occupation into customers who bought the product
# (flag == 'Yes') and customers who did not.
# The counts are computed with two vectorised value_counts calls and aligned to
# `occupation_labels`; this also avoids a loop variable named `occupation`,
# which would overwrite the `occupation` counts Series.
bought_mask = df['flag'] == 'Yes'
occupation_totals = df['occupation'].value_counts().reindex(occupation_labels, fill_value=0)
occupation_buyers = (df.loc[bought_mask, 'occupation']
                     .value_counts()
                     .reindex(occupation_labels, fill_value=0))
did_cc = occupation_buyers.tolist()                           #did buy
didnt_cc = (occupation_totals - occupation_buyers).tolist()   #Didn't buy

#=====PLOT Preparations and Plotting====#
# x positions are derived here from the labels being plotted instead of
# relying on an `ind` left over from another cell.
ind = np.arange(len(occupation_labels))
width = 0.40
fig, ax = plt.subplots(figsize=(12,7))
did_buy = ax.bar(ind, did_cc, width, color='#ADFF2F')
# The second series is shifted right by one bar width so the pairs sit side by side.
didnt_buy = ax.bar(ind + width, didnt_cc, width, color='#DA70D6')

#Add some text for labels, title and axes ticks
ax.set_xlabel("occupation",fontsize=20)
ax.set_ylabel('amount of customers',fontsize=20)
ax.set_title('bought or didnt bought product Based on their occupation',fontsize=22)
ax.set_xticks(ind + width / 2) #Centre each tick between the two bars of a pair
# Tick labels come from the same list that ordered the counts, so they cannot
# drift out of sync with the bars.
ax.set_xticklabels([str(label) for label in occupation_labels],
                   fontsize = 12)
ax.legend((did_buy,didnt_buy),('buy','didnt'),fontsize=17)
autolabel(did_buy, 10)
autolabel(didnt_buy, 10)
plt.show()
print(did_cc)
print(didnt_cc)

**Machine learning**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
# Separate the target column ('flag') from the predictor columns.
y = df['flag']
X = df.drop(columns=['flag'])

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# identical from run to run.
split_options = dict(train_size=0.8, test_size=0.2, random_state=0)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_options)

In [None]:
# All categorical columns
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

# Columns that can be safely label encoded: every value occurring in the
# validation data must also occur in the training data, because the encoder is
# fitted on the training column only. Requiring the two value sets to be equal
# would additionally discard columns whose rare categories merely happen to be
# absent from the validation split.
good_label_cols = [col for col in object_cols
                   if set(X_valid[col]).issubset(set(X_train[col]))]

# Problematic columns that will be dropped from the dataset.
# Built with a comprehension so the original column order is kept and the
# printed list is the same on every run.
bad_label_cols = [col for col in object_cols if col not in good_label_cols]

print('Categorical columns that will be label encoded:', good_label_cols)
print('\nCategorical columns that will be dropped from the dataset:', bad_label_cols)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Drop categorical columns that will not be encoded
label_X_train = X_train.drop(bad_label_cols, axis=1)
label_X_valid = X_valid.drop(bad_label_cols, axis=1)

# Apply label encoder
# A fresh encoder is fitted for every column and kept in `label_encoders`, so
# each column's category <-> integer mapping stays available after the loop.
# Columns are visited in list order, which makes the run deterministic.
label_encoders = {}
for col in good_label_cols:
    label_encoder = LabelEncoder()
    # Fit on the training column only, then reuse that mapping for validation.
    label_X_train[col] = label_encoder.fit_transform(X_train[col])
    label_X_valid[col] = label_encoder.transform(X_valid[col])
    label_encoders[col] = label_encoder

In [None]:
# Train a 500-tree random forest on the label-encoded training data.
# random_state is fixed (0, the same seed as the train/validation split) so
# that the reported metrics are reproducible across runs.
model = RandomForestClassifier(n_estimators=500, random_state=0)
model.fit(label_X_train, y_train)
preds = model.predict(label_X_valid)

# Accuracy in percent, computed from the predictions already made above;
# model.score() would run the whole forest over the validation set a second time.
accuracy = round(accuracy_score(y_valid, preds) * 100, 2)

# Per-class precision / recall / F1 on the validation set.
report = classification_report(y_true=y_valid, y_pred=preds, digits=4)
print('Accuracy', accuracy)
print(report)
# Confusion matrix with row and column totals, shown as the cell output.
pd.crosstab(y_valid, preds, rownames=['Actual'], colnames=['Predicted'], margins=True)

In [None]:
# Plot the features the fitted model reports as most important.
TOP_N_FEATURES = 10  # how many of the highest-ranked features to show
feat_importances = pd.Series(model.feature_importances_, index=label_X_train.columns)

# Explicit figure/axes with a title and axis labels so the chart can be read
# on its own; dedicated names keep the `fig` / `ax` of earlier charts intact.
importance_fig, importance_ax = plt.subplots(figsize=(12,6))
feat_importances.nlargest(TOP_N_FEATURES).plot(kind='barh', ax=importance_ax)
importance_ax.set_title('Top %d feature importances' % TOP_N_FEATURES)
importance_ax.set_xlabel('importance')
importance_ax.set_ylabel('feature')
plt.show()