In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


EDA : Detailed Analysis

# Imputing Missing Data
- Missing data refers to the absence of values for certain observations and is an unavoidable problem in most data sources. Scikit-learn does not support missing values as input, so we need to remove observations with missing data or transform them into permitted values. The act of replacing missing data with statistical estimates of missing values is called imputation.The goal of any imputation technique is to produce a complete dataset that can be used to train machine learning models. The choice of imputation technique we use will depend on whether the data is missing at random, the number of missing values, and the machine learning model we intend to use.
- Missing data is one of the trickiest parts of Data Cleaning for Machine Learning. We cannot just remove a piece of information unless we are aware of the importance with respect to our ultimate target variable and how it is related to it. E.g., imagine you are trying to check customer churn based on Customer Ratings, and it has missing values. If you drop variables, it could form an important part of the data and could play a crucial role in prediction, which forms an important part of real-world problems.

# Handling Missing Values

1. Deleting the column with missing data    :      df.dropna(axis=1)
2. Deleting the row with missing data       :      df.dropna(axis=0)
3. Impute missing values for continuous variable  :      df['X'].fillna(df['Y'].mean())
4. Imputation with an additional column     :      
5. Filling with a Regression Model
6. Impute missing values for categorical variable
7. Prediction of missing values
8. Imputation using Deep Learning Library — Datawig

Replacing characters with value

        varnames = ['A'+str(s) for s in range(1,17)]
        data.columns = varnames
        data = data.replace('?', np.nan)                  #Replacing with nan value
        data['A16'] = data['A16'].map({'+':1, '-':0})     #Recode the target variable as binary:
        random.seed(9001)                                 #Recode the target variable as binary:
        values = set([random.randint(0, len(data)) for p in range(0, 100)])
        for var in ['A3', 'A8', 'A9', 'A10']:
             data.loc[values, var] = np.nan

Removing observations with missing data

        data.isnull().mean().sort_values(ascending=True)
        data_cca = data.dropna()

Performing mean or median imputation

        import pandas as pd       #import pandas and the required functions and classes from scikitlearn and Feature-engine:
        from sklearn.model_selection import train_test_split
        from sklearn.impute import SimpleImputer
        from feature_engine.missing_data_imputers import MeanMedianImputer
        
        # In mean and median imputation, the mean or median values should be calculated using the variables in the train set;             therefore, let's separate the data
        X_train.isnull().mean()
        for var in ['A2', 'A3', 'A8', 'A11', 'A15']:
        value = X_train[var].median()
        X_train[var] = X_train[var].fillna(value)
        X_test[var] = X_test[var].fillna(value)

Implementing mode or frequent category imputation

        X_train, X_test, y_train, y_test = train_test_split(
        data.drop('A16', axis=1), data['A16'], test_size=0.3,
        random_state=0)
        for var in ['A4', 'A5', 'A6', 'A7']:
         value = X_train[var].mode()[0]
         X_train[var] = X_train[var].fillna(value)
         X_test[var] = X_test[var].fillna(value)
         
Replacing missing values with an arbitrary number

        X_train[['A2','A3', 'A8', 'A11']].max()
        for var in ['A2','A3', 'A8', 'A11']:
         X_train[var].fillna(99, inplace=True)
         X_test[var].fillna(99, inplace=True)
         
Capturing missing values in a bespoke category

        X_train, X_test, y_train, y_test = train_test_split(
         data.drop('A16', axis=1), data['A16'], test_size=0.3,
         random_state=0)      # let's separate the data into train and test sets while keeping only categorical variables:
        for var in ['A4', 'A5', 'A6', 'A7']: 
         X_train[var].fillna('Missing', inplace=True)
         X_test[var].fillna('Missing', inplace=True)


Replacing missing values with a value at the end of the distribution

        for var in ['A2', 'A3', 'A8', 'A11', 'A15']:
         IQR = X_train[var].quantile(0.75) - X_train[var].quantile(0.25)
         value = X_train[var].quantile(0.75) + 1.5 * IQR
         X_train[var] = X_train[var].fillna(value)
         X_test[var] = X_test[var].fillna(value)
         
         value = X_train[var].mean() + 3*X_train[var].std()    #If we want to use the Gaussian approximation instead of the IQR

proximity rule

Implementing random sample imputation

        X_train, X_test, y_train, y_test = train_test_split(
         data.drop('A16', axis=1), data['A16'], test_size=0.3,
         random_state=0)
        number_na = X_train['A2'].isnull().sum()
        random_sample_train = X_train['A2'].dropna().sample(number_na, random_state=0)

Adding a missing value indicator variable

        X_train, X_test, y_train, y_test = train_test_split(
         data.drop('A16', axis=1), data['A16'], test_size=0.3,
         random_state=0)     # Using NumPy, we'll add a missing indicator to the numerical and categorical variables in a loop:
        for var in ['A1', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8']:
         X_train[var + '_NA'] = np.where(X_train[var].isnull(), 1, 0)
         X_test[var + '_NA'] = np.where(X_test[var].isnull(), 1, 0)

Performing multivariate imputation by chained equations

        variables = ['A2','A3','A8', 'A11', 'A14', 'A15', 'A16']
        data = pd.read_csv('creditApprovalUCI.csv', usecols=variables)
        X_train, X_test, y_train, y_test = train_test_split(
         data.drop('A16', axis=1),data['A16' ], test_size=0.3,
         random_state=0)

Assembling an imputation pipeline with scikit-learn

        features_num_arbitrary = ['A3', 'A8']
        features_num_median = ['A2', 'A14']
        features_cat_frequent = ['A4', 'A5', 'A6', 'A7']
        features_cat_missing = ['A1', 'A9', 'A10']
        imputer_num_arbitrary = Pipeline(steps=[
         ('imputer', SimpleImputer(strategy='constant', fill_value=99)),])
        imputer_num_median = Pipeline(steps=[
         ('imputer', SimpleImputer(strategy='median')),])
        imputer_cat_frequent = Pipeline(steps=[
         ('imputer', SimpleImputer(strategy='most_frequent')),])
        imputer_cat_missing = Pipeline(steps=[
         ('imputer', SimpleImputer(strategy='constant',
        fill_value='Missing')),])

Assembling an imputation pipeline with Feature-engine

        X_train, X_test, y_train, y_test = train_test_split(
         data.drop('A16', axis=1), data['A16'], test_size=0.3,
         random_state=0)
        features_num_arbitrary = ['A3', 'A8']
        features_num_median = ['A2', 'A14']
        features_cat_frequent = ['A4', 'A5', 'A6', 'A7']
        features_cat_missing = ['A1', 'A9', 'A10']


Reading dataset

In [160]:
df_test = pd.read_csv('D:/FSDS-iNeuron/3.Resource/Dataset/data0/aug_test.csv')
df_test.head(2)

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours
0,32403,city_41,0.827,Male,Has relevent experience,Full time course,Graduate,STEM,9,<10,,1,21
1,9858,city_103,0.92,Female,Has relevent experience,no_enrollment,Graduate,STEM,5,,Pvt Ltd,1,98


In [161]:
df_train = pd.read_csv('D:/FSDS-iNeuron/3.Resource/Dataset/data0/aug_train.csv')
df_train.head(3)

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0


In [228]:
df = pd.read_csv('D:/FSDS-iNeuron/3.Resource/Dataset/data0/hr_cleaned.csv')
df_cleaned = df
df_cleaned.head(3)

Unnamed: 0,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,21,Undefined,,1,36,1.0
1,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,Small & Medium-org.,Pvt Ltd,5,47,0.0
2,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,Undefined,,0,83,0.0


In [163]:
df_imputed = pd.read_csv('D:/FSDS-iNeuron/3.Resource/Dataset/data0/hr_imputed.csv')
df_imputed.head(3)

Unnamed: 0,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,21,Undefined,Pvt Ltd,1,36,1.0
1,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,Small & Medium-org.,Pvt Ltd,5,47,0.0
2,city_21,0.624,Undefined,No relevent experience,Full time course,Graduate,STEM,5,Undefined,Pvt Ltd,0,83,0.0


In [164]:
df_processed = pd.read_csv('D:/FSDS-iNeuron/3.Resource/Dataset/data0/hr_preprocessed.csv')
df_processed.head(4)

Unnamed: 0,city,city_development_index,experience,company_size,last_new_job,training_hours,target,gender Female,gender Male,gender Other,...,major_discipline Humanities,major_discipline No Major,major_discipline Other,major_discipline STEM,company_type Early Stage Startup,company_type Funded Startup,company_type NGO,company_type Other,company_type Public Sector,company_type Pvt Ltd
0,103,-0.04413,10.271582,2,0.693147,4.82124,1.0,0,1,0,...,0,0,0,1,0,0,0,0,0,1
1,40,-0.062109,8.340147,1,1.791759,5.305129,0.0,0,1,0,...,0,0,0,1,0,0,0,0,0,1
2,21,-0.065823,4.060962,2,0.0,6.411223,0.0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
3,115,-0.061446,-1.017357,2,0.0,5.49424,1.0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [167]:
df_cleaned.columns

Index(['city', 'city_development_index', 'gender', 'relevent_experience',
       'enrolled_university', 'education_level', 'major_discipline',
       'experience', 'company_size', 'company_type', 'last_new_job',
       'training_hours', 'target'],
      dtype='object')

In [198]:
df_cleaned.columns

Index(['city', 'city_development_index', 'gender', 'relevent_experience',
       'enrolled_university', 'education_level', 'major_discipline',
       'experience', 'company_size', 'company_type', 'last_new_job',
       'training_hours', 'target'],
      dtype='object')

In [199]:
df_cleaned['major_discipline'].value_counts()

STEM               14492
Humanities           669
Other                381
Business Degree      327
Arts                 253
No Major             223
Name: major_discipline, dtype: int64

# Separating Categorical variable and Numerical Variable : df_test

Categorial
   - Nominal (Categorical Variable/ The variable which has no numeric value)
       - gender
       - relevent_experience
       - enrolled_university
       - city
       - major_discipline
       - company_type
   - Ordinal (Order matters but value doesn't)
       - education_level
       
Numerical

   - Continuous
        - city_development_index
        - experience
        - last_new_job
        - training_hours
        - 
   - Discrete
        - city
        - company_size
        - target
        - enrollee_id

Segragating columns

In [213]:
categorical_col = [feature for feature in df_cleaned.columns if (df_cleaned[feature].dtypes == 'O')]
categorical_col

['city',
 'gender',
 'relevent_experience',
 'enrolled_university',
 'education_level',
 'major_discipline',
 'company_size',
 'company_type']

In [215]:
numerical_col = [feature for feature in df_cleaned.columns if (df_cleaned[feature].dtypes != 'object')]
numerical_col

['city_development_index',
 'experience',
 'last_new_job',
 'training_hours',
 'target']

# Finding null values

In [200]:
df_cleaned.isnull().sum()

city                         0
city_development_index       0
gender                    4508
relevent_experience          0
enrolled_university        386
education_level            460
major_discipline          2813
experience                   0
company_size                 0
company_type              6140
last_new_job                 0
training_hours               0
target                       0
dtype: int64

In [207]:
df_cleaned.isnull().sum().sum()

14307

To check duplicate records

In [202]:
df_cleaned.duplicated().sum()

104

In [205]:
[features for features in df_cleaned.columns if df_cleaned[features].isnull().sum()>0]

['gender',
 'enrolled_university',
 'education_level',
 'major_discipline',
 'company_type']

# Handling null value : df_test

Method 1: Performing Mean Imputation

Mean imputation is one such method in which the mean of the observed values for each variable is computed and the missing values for that variable are imputed by this mean.

In [216]:
for var in numerical_col:
    value = df_cleaned[var].mean()
    df_cleaned[var] = df_cleaned[var].fillna(value)

In [220]:
df_cleaned[numerical_col].isnull().sum().sum()

0

In [222]:
df_cleaned[numerical_col].isnull().mean()

city_development_index    0.0
experience                0.0
last_new_job              0.0
training_hours            0.0
target                    0.0
dtype: float64

Method 2: Imputing Missing Values by mean using scikit-learn

In [223]:
import sklearn
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')

In [229]:
df1 = df
df1.head()

Unnamed: 0,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,21,Undefined,,1,36,1.0
1,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,Small & Medium-org.,Pvt Ltd,5,47,0.0
2,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,Undefined,,0,83,0.0
3,city_115,0.789,,No relevent experience,,Graduate,Business Degree,0,Undefined,Pvt Ltd,0,52,1.0
4,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,21,Small & Medium-org.,Funded Startup,4,8,0.0


Segragating categorical and numerical variable

In [230]:
num_col = [fea for fea in df1.columns if (df1[fea].dtype != 'O' and df1[fea].dtype != int)]
num_col

['city_development_index',
 'experience',
 'last_new_job',
 'training_hours',
 'target']

In [232]:
X = df1[num_col]
X

Unnamed: 0,city_development_index,experience,last_new_job,training_hours,target
0,0.920,21,1,36,1.0
1,0.776,15,5,47,0.0
2,0.624,5,0,83,0.0
3,0.789,0,0,52,1.0
4,0.767,21,4,8,0.0
...,...,...,...,...,...
19153,0.878,14,1,42,1.0
19154,0.920,14,4,52,1.0
19155,0.920,21,4,44,0.0
19156,0.802,0,2,97,0.0


In [233]:
imputer.fit(X)

SimpleImputer()

In [234]:
X

Unnamed: 0,city_development_index,experience,last_new_job,training_hours,target
0,0.920,21,1,36,1.0
1,0.776,15,5,47,0.0
2,0.624,5,0,83,0.0
3,0.789,0,0,52,1.0
4,0.767,21,4,8,0.0
...,...,...,...,...,...
19153,0.878,14,1,42,1.0
19154,0.920,14,4,52,1.0
19155,0.920,21,4,44,0.0
19156,0.802,0,2,97,0.0


In [235]:
X= imputer.transform(X)
X

array([[  0.92 ,  21.   ,   1.   ,  36.   ,   1.   ],
       [  0.776,  15.   ,   5.   ,  47.   ,   0.   ],
       [  0.624,   5.   ,   0.   ,  83.   ,   0.   ],
       ...,
       [  0.92 ,  21.   ,   4.   ,  44.   ,   0.   ],
       [  0.802,   0.   ,   2.   ,  97.   ,   0.   ],
       [  0.855,   2.   ,   1.   , 127.   ,   0.   ]])

In [236]:
# let showcase the imputed data in DataFrame by using pandas

result = pd.DataFrame(X,columns=['city_development_index',
 'experience',
 'last_new_job',
 'training_hours',
 'target'])
result

Unnamed: 0,city_development_index,experience,last_new_job,training_hours,target
0,0.920,21.0,1.0,36.0,1.0
1,0.776,15.0,5.0,47.0,0.0
2,0.624,5.0,0.0,83.0,0.0
3,0.789,0.0,0.0,52.0,1.0
4,0.767,21.0,4.0,8.0,0.0
...,...,...,...,...,...
19153,0.878,14.0,1.0,42.0,1.0
19154,0.920,14.0,4.0,52.0,1.0
19155,0.920,21.0,4.0,44.0,0.0
19156,0.802,0.0,2.0,97.0,0.0


# Method 3: Replacing missing values in categorical features by the feature’s mode.

In [318]:
df_2 = df_cleaned

In [319]:
categorical_col = [features for features in df_2.columns if df_2[features].dtypes == 'object']
categorical_col

['city',
 'gender',
 'relevent_experience',
 'enrolled_university',
 'education_level',
 'major_discipline',
 'company_size',
 'company_type']

In [320]:
x = df_2[categorical_col]
x

Unnamed: 0,city,gender,relevent_experience,enrolled_university,education_level,major_discipline,company_size,company_type
0,city_103,Male,Has relevent experience,no_enrollment,Graduate,STEM,Undefined,
1,city_40,Male,No relevent experience,no_enrollment,Graduate,STEM,Small & Medium-org.,Pvt Ltd
2,city_21,,No relevent experience,Full time course,Graduate,STEM,Undefined,
3,city_115,,No relevent experience,,Graduate,Business Degree,Undefined,Pvt Ltd
4,city_162,Male,Has relevent experience,no_enrollment,Masters,STEM,Small & Medium-org.,Funded Startup
...,...,...,...,...,...,...,...,...
19153,city_173,Male,No relevent experience,no_enrollment,Graduate,Humanities,Undefined,
19154,city_103,Male,Has relevent experience,no_enrollment,Graduate,STEM,Undefined,
19155,city_103,Male,Has relevent experience,no_enrollment,Graduate,STEM,Small & Medium-org.,Pvt Ltd
19156,city_65,Male,Has relevent experience,no_enrollment,High School,,Small & Medium-org.,Pvt Ltd


In [321]:
x.isnull().sum()

city                      0
gender                 4508
relevent_experience       0
enrolled_university     386
education_level         460
major_discipline       2813
company_size              0
company_type           6140
dtype: int64

In [322]:
value = x['gender'].mode()[0]

In [323]:
import warnings
warnings. simplefilter("ignore")

In [324]:
x['gender'] = x['gender'].fillna(value)

In [325]:
x['enrolled_university'] = x['enrolled_university'].fillna(x['enrolled_university'].mode()[0])

In [326]:
x['major_discipline'] = x['major_discipline'].fillna(x['major_discipline'].mode()[0])

In [327]:
x['company_type'] = x['company_type'].fillna(x['company_type'].mode()[0])

In [328]:
x['education_level'] = x['education_level'].fillna(x['education_level'].mode()[0])

In [262]:
x.isnull().sum()

city                   0
gender                 0
relevent_experience    0
enrolled_university    0
education_level        0
major_discipline       0
company_size           0
company_type           0
dtype: int64

# Method 4:Replacing Missing Values with an arbitary number

- Arbitrary number imputation consists of replacing missing values with an arbitrary value. Some commonly used values include 999, 9999, or -1 for positive distributions. This method is suitable for numerical variables. A similar method for categorical variables will be discussed in the Capturing missing values in a bespoke category recipe.

- When replacing missing values with an arbitrary number, we need to be careful not to select a value close to the mean or the median, or any other common value of the distribution.

In [341]:
df_3 = pd.read_csv('D:/FSDS-iNeuron/3.Resource/Dataset/data1/Travel.csv')

In [342]:
numerical_col = [feature for feature in df_3.columns if df_3[feature].dtypes !='object' and df_3[feature].dtypes != 'int']
numerical_col

['CustomerID',
 'ProdTaken',
 'Age',
 'CityTier',
 'DurationOfPitch',
 'NumberOfPersonVisiting',
 'NumberOfFollowups',
 'PreferredPropertyStar',
 'NumberOfTrips',
 'Passport',
 'PitchSatisfactionScore',
 'OwnCar',
 'NumberOfChildrenVisiting',
 'MonthlyIncome']

In [343]:
x = df_3[numerical_col]
x

Unnamed: 0,CustomerID,ProdTaken,Age,CityTier,DurationOfPitch,NumberOfPersonVisiting,NumberOfFollowups,PreferredPropertyStar,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,MonthlyIncome
0,200000,1,41.0,3,6.0,3,3.0,3.0,1.0,1,2,1,0.0,20993.0
1,200001,0,49.0,1,14.0,3,4.0,4.0,2.0,0,3,1,2.0,20130.0
2,200002,1,37.0,1,8.0,3,4.0,3.0,7.0,1,3,0,0.0,17090.0
3,200003,0,33.0,1,9.0,2,3.0,3.0,2.0,1,5,1,1.0,17909.0
4,200004,0,,1,8.0,2,3.0,4.0,1.0,0,5,1,0.0,18468.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4883,204883,1,49.0,3,9.0,3,5.0,4.0,2.0,1,1,1,1.0,26576.0
4884,204884,1,28.0,1,31.0,4,5.0,3.0,3.0,1,3,1,2.0,21212.0
4885,204885,1,52.0,3,17.0,4,4.0,4.0,7.0,0,1,1,3.0,31820.0
4886,204886,1,19.0,3,16.0,3,4.0,3.0,3.0,0,5,0,2.0,20289.0


In [344]:
x

Unnamed: 0,CustomerID,ProdTaken,Age,CityTier,DurationOfPitch,NumberOfPersonVisiting,NumberOfFollowups,PreferredPropertyStar,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,MonthlyIncome
0,200000,1,41.0,3,6.0,3,3.0,3.0,1.0,1,2,1,0.0,20993.0
1,200001,0,49.0,1,14.0,3,4.0,4.0,2.0,0,3,1,2.0,20130.0
2,200002,1,37.0,1,8.0,3,4.0,3.0,7.0,1,3,0,0.0,17090.0
3,200003,0,33.0,1,9.0,2,3.0,3.0,2.0,1,5,1,1.0,17909.0
4,200004,0,,1,8.0,2,3.0,4.0,1.0,0,5,1,0.0,18468.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4883,204883,1,49.0,3,9.0,3,5.0,4.0,2.0,1,1,1,1.0,26576.0
4884,204884,1,28.0,1,31.0,4,5.0,3.0,3.0,1,3,1,2.0,21212.0
4885,204885,1,52.0,3,17.0,4,4.0,4.0,7.0,0,1,1,3.0,31820.0
4886,204886,1,19.0,3,16.0,3,4.0,3.0,3.0,0,5,0,2.0,20289.0


In [345]:
x.max()

CustomerID                  204887.0
ProdTaken                        1.0
Age                             61.0
CityTier                         3.0
DurationOfPitch                127.0
NumberOfPersonVisiting           5.0
NumberOfFollowups                6.0
PreferredPropertyStar            5.0
NumberOfTrips                   22.0
Passport                         1.0
PitchSatisfactionScore           5.0
OwnCar                           1.0
NumberOfChildrenVisiting         3.0
MonthlyIncome                98678.0
dtype: float64

In [346]:
x.isnull().sum()

CustomerID                    0
ProdTaken                     0
Age                         226
CityTier                      0
DurationOfPitch             251
NumberOfPersonVisiting        0
NumberOfFollowups            45
PreferredPropertyStar        26
NumberOfTrips               140
Passport                      0
PitchSatisfactionScore        0
OwnCar                        0
NumberOfChildrenVisiting     66
MonthlyIncome               233
dtype: int64

In [347]:
x['Age'].fillna(70, inplace=True)   # here we are replacing null  by  arbitrary value number 70 for age,
                                    #as it is greater than max age

In [348]:
x.isnull().sum()

CustomerID                    0
ProdTaken                     0
Age                           0
CityTier                      0
DurationOfPitch             251
NumberOfPersonVisiting        0
NumberOfFollowups            45
PreferredPropertyStar        26
NumberOfTrips               140
Passport                      0
PitchSatisfactionScore        0
OwnCar                        0
NumberOfChildrenVisiting     66
MonthlyIncome               233
dtype: int64

In [349]:
# By using sklearn

In [350]:
imputer = SimpleImputer(strategy='constrant', fill_value=70)
data = pd.read_csv('D:/FSDS-iNeuron/3.Resource/Dataset/data1/Travel.csv')
x_1 = data['Age'].to_numpy()      # converting into numpy array
x_1 = x_1.reshape(-1,1)

In [351]:
imputer.fit(x_1)

ValueError: Can only use these strategies: ['mean', 'median', 'most_frequent', 'constant']  got strategy=constrant

In [352]:
result_1 =pd.DataFrame(x_1,columns=['Age']) #  converting bacto to datframe
result_1.isnull().sum() # checking after imputation

Age    226
dtype: int64

In [353]:
result_1

Unnamed: 0,Age
0,41.0
1,49.0
2,37.0
3,33.0
4,
...,...
4883,49.0
4884,28.0
4885,52.0
4886,19.0
