In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_curve, roc_auc_score, r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score, KFold, LeaveOneOut

from sklearn.feature_selection import RFE

from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso

### 
    Groove Mall is the centre of attraction in the city of Demacia. Every day, thousands of people visit for
    different purposes: to watch movies, buy products, eat, and so on. To increase the number of visitors, the board
    decided to run a campaign to gauge visitors based on their financial status. A series of questions was
    asked of visitors to the mall, and the answers were collected in CSV format. The task is to predict the
    income class of a visitor based on the visitor's attributes.
    Dataset Description: 
    • Sex: Male or Female[1, 2]
    • HouseholdMembers: [1, 9]
    • MaritalStatus: Married, Single, Divorced, Widowed, Engaged[1, 5]
    • Under18: 2x [0, 9]
    • Age: 10x [1, 7]
    • HouseholdStatus: [1, 3]
    • Education: Junior School, High School, College, PhD1, PhD2, No Education[1, 6]
    • TypeOfHome: 1 BHK, 2BHK, 3BHK, Villa, Homeless[1, 5]
    • Occupation: Finance, Teacher, Govt. Official, Police Officer, Engineer, Serviceman, Businessman,
    Designer, Farmer[1, 9]
    • EthnicClass: Indian, Russian, Chinese, Japanese, American, German, African or Murican[1, 8]
    • YearslnSf: Number[1, 5]
    • Language: One, Two, More than two[1, 3]
    • Duallncome: Yes, No, Invalid[1, 3]
    • Income: 10000x{1, 2, 3, 4, 5, 6, 7, 8, 9}
    Task to be Performed: 
    • Read the dataset with no headers, then assign the respective column names and find the missing values in
    each column - Intermediate
    • Find the total number of missing values and fill all of them using an appropriate method for each column - Advanced
    • Using RFE select 6 best features - Advanced
    • Split the dataset into Train-Test set and apply multiple logistic regression and find the accuracy over test
    and train set - Intermediate


In [5]:
# The CSV has no header row, so the column labels are supplied here in file order.
# The labels are kept exactly as the later cells spell them.
COLUMN_NAMES = [
    'Sex',
    'HouseholdMembers',
    'MaritalStatus',
    'Under18',
    'Age',
    'HouseholdStatus',
    'Education',
    'TypeOfHome',
    'Occupation',
    'EthnicClass',
    'YearslnSf',
    'Language',
    'Duallncome',
    'Income',
]

# header=None tells pandas that the first line of the file is data, not labels.
df = pd.read_csv("marketing.csv", header=None, names=COLUMN_NAMES)
df.head()

Unnamed: 0,Sex,HouseholdMembers,MaritalStatus,Under18,Age,HouseholdStatus,Education,TypeOfHome,Occupation,EthnicClass,YearslnSf,Language,Duallncome,Income
0,2,1.0,5,4.0,5.0,5.0,3,3.0,0,1.0,1.0,7.0,,9
1,1,1.0,5,5.0,5.0,5.0,3,5.0,2,1.0,1.0,7.0,1.0,9
2,2,1.0,3,5.0,1.0,5.0,2,3.0,1,2.0,3.0,7.0,1.0,9
3,2,5.0,1,2.0,6.0,5.0,1,4.0,2,3.0,1.0,7.0,1.0,1
4,2,5.0,1,2.0,6.0,3.0,1,4.0,2,3.0,1.0,7.0,1.0,1


In [18]:
# Apply the multipliers listed in the dataset description
# (Under18: 2x, Age: 10x, Income: 10000x).
# Only the Age line was present in this cell, yet the recorded output shows
# Under18 doubled and Income multiplied by 10000 as well, so all three scalings
# are spelled out here to make the cell reproduce that output on a fresh kernel.
# NOTE: this cell is not idempotent; re-running it scales the columns again,
# so reload `df` before executing it a second time.
df['Under18'] = df['Under18'] * 2
df['Age'] = df['Age'] * 10
df['Income'] = df['Income'] * 10000
df.head()

Unnamed: 0,Sex,HouseholdMembers,MaritalStatus,Under18,Age,HouseholdStatus,Education,TypeOfHome,Occupation,EthnicClass,YearslnSf,Language,Duallncome,Income
0,2,1.0,5,8.0,50.0,5.0,3,3.0,0,1.0,1.0,7.0,,90000
1,1,1.0,5,10.0,50.0,5.0,3,5.0,2,1.0,1.0,7.0,1.0,90000
2,2,1.0,3,10.0,10.0,5.0,2,3.0,1,2.0,3.0,7.0,1.0,90000
3,2,5.0,1,4.0,60.0,5.0,1,4.0,2,3.0,1.0,7.0,1.0,10000
4,2,5.0,1,4.0,60.0,3.0,1,4.0,2,3.0,1.0,7.0,1.0,10000


In [9]:
# Print each column's dtype and non-null count; columns whose non-null count is
# below the number of rows contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8993 entries, 0 to 8992
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Sex               8993 non-null   int64  
 1   HouseholdMembers  8833 non-null   float64
 2   MaritalStatus     8993 non-null   int64  
 3   Under18           8907 non-null   float64
 4   Age               8857 non-null   float64
 5   HouseholdStatus   8080 non-null   float64
 6   Education         8993 non-null   int64  
 7   TypeOfHome        8618 non-null   float64
 8   Occupation        8993 non-null   int64  
 9   EthnicClass       8753 non-null   float64
 10  YearslnSf         8636 non-null   float64
 11  Language          8925 non-null   float64
 12  Duallncome        8634 non-null   float64
 13  Income            8993 non-null   int64  
dtypes: float64(9), int64(5)
memory usage: 983.7 KB


In [16]:
# Count the missing (NaN) entries in every column.
df.isna().sum()

Sex                   0
HouseholdMembers    160
MaritalStatus         0
Under18              86
Age                 136
HouseholdStatus     913
Education             0
TypeOfHome          375
Occupation            0
EthnicClass         240
YearslnSf           357
Language             68
Duallncome          359
Income                0
dtype: int64

In [14]:
# Imputer that replaces missing entries with the column mean.
mean_imputer = SimpleImputer(strategy='mean')
# Imputer that replaces missing entries with the most frequent value (the mode).
# SimpleImputer has no 'mode' strategy; the accepted name is 'most_frequent',
# and an invalid strategy only fails once the imputer is fitted.
mode_imputer = SimpleImputer(strategy='most_frequent')

In [19]:
# Columns that contain missing values, grouped by how they are filled.
# Every column appears in exactly one list.

# Filled with the column mean.
numerical_null_columns = ['Age', 'YearslnSf']

# Filled with the most frequent value of the column.
# 'YearslnSf' is deliberately not repeated here: it is already mean-imputed via
# the numerical list, which leaves nothing for a mode imputer to fill.
categorical_null_columns = [
    'HouseholdMembers',
    'Under18',
    'HouseholdStatus',
    'TypeOfHome',
    'EthnicClass',
    'Language',
    'Duallncome',
]

In [22]:
# Fill the missing values in place.
# SimpleImputer computes its statistic independently for every column, so each
# group of columns can be imputed with a single fit_transform call instead of a
# per-column loop. Passing a 2-D column selection also avoids assigning an
# (n, 1) array to a single DataFrame column.

# Numerical columns: replace NaN with the column mean.
mean_imputer = SimpleImputer(strategy='mean')
df[numerical_null_columns] = mean_imputer.fit_transform(df[numerical_null_columns])

# Categorical (coded) columns: replace NaN with the most frequent value.
mode_imputer = SimpleImputer(strategy='most_frequent')
df[categorical_null_columns] = mode_imputer.fit_transform(df[categorical_null_columns])

# Every count should now be zero.
df.isna().sum()

Sex                 0
HouseholdMembers    0
MaritalStatus       0
Under18             0
Age                 0
HouseholdStatus     0
Education           0
TypeOfHome          0
Occupation          0
EthnicClass         0
YearslnSf           0
Language            0
Duallncome          0
Income              0
dtype: int64

In [23]:
# Preview the first five rows of `df`.
df.head()

Unnamed: 0,Sex,HouseholdMembers,MaritalStatus,Under18,Age,HouseholdStatus,Education,TypeOfHome,Occupation,EthnicClass,YearslnSf,Language,Duallncome,Income
0,2,1.0,5,8.0,50.0,5.0,3,3.0,0,1.0,1.0,7.0,1.0,90000
1,1,1.0,5,10.0,50.0,5.0,3,5.0,2,1.0,1.0,7.0,1.0,90000
2,2,1.0,3,10.0,10.0,5.0,2,3.0,1,2.0,3.0,7.0,1.0,90000
3,2,5.0,1,4.0,60.0,5.0,1,4.0,2,3.0,1.0,7.0,1.0,10000
4,2,5.0,1,4.0,60.0,3.0,1,4.0,2,3.0,1.0,7.0,1.0,10000


In [27]:
# Features are every column except the last one; the target is the last column.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Recursive feature elimination driven by a linear regression estimator,
# keeping 6 features.
model = LinearRegression()
# n_features_to_select is passed by keyword: scikit-learn >= 1.0 rejects it as a
# positional argument, while the keyword form works on older releases too.
rfe = RFE(model, n_features_to_select=6)
rfe = rfe.fit(X, y)

# Boolean mask over the columns of X: True marks a selected feature.
rfe.support_

array([ True,  True, False,  True, False, False, False, False, False,
        True,  True, False,  True])

In [28]:
# Names of the columns of X that RFE selected (boolean mask rfe.support_).
X.columns[rfe.support_]

Index(['Sex', 'HouseholdMembers', 'Under18', 'EthnicClass', 'YearslnSf',
       'Duallncome'],
      dtype='object')

In [29]:
# RFE rank of each column of X, in column order; selected features have rank 1.
rfe.ranking_

array([1, 1, 6, 1, 7, 4, 5, 2, 3, 1, 1, 8, 1])

In [31]:
# Hold out 30% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Multi-class logistic regression fitted with the stochastic 'sag' solver.
# - random_state is set because 'sag' shuffles the data, so an unseeded fit
#   gives slightly different coefficients and accuracies on every run.
# - multi_class is omitted: with a non-liblinear solver and more than two
#   classes the default already fits a multinomial model, and the explicit
#   argument is deprecated in recent scikit-learn releases.
# NOTE(review): the features are not standardised; the scikit-learn docs state
# that fast convergence of 'sag' is only guaranteed on features with roughly
# the same scale. Consider scaling before fitting.
model = LogisticRegression(solver='sag', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

LogisticRegression(C=10, max_iter=1000, multi_class='multinomial', solver='sag')

In [32]:
# Accuracy of the fitted model on the training rows and on the held-out rows.
train_accuracy = accuracy_score(y_train, model.predict(X_train))
test_accuracy = accuracy_score(y_test, model.predict(X_test))

# Printed in the order train, then test, one value per line.
print(train_accuracy)
print(test_accuracy)

0.33248610007942814
0.33432171979243885


In [33]:
from scipy.stats import t

In [None]:
# NOTE(review): this cell contained only the incomplete expression `t.`
# (apparently a leftover from tab-completion on scipy.stats.t), which is a
# SyntaxError and stops Restart & Run All. The intended call cannot be
# recovered from the notebook; fill it in or delete this cell.