# Reference : https://www.kaggle.com/apryor6/santander-product-recommendation/detailed-cleaning-visualization-python/comments

In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
# Run `ls ../input`, decode its output as UTF-8 and print the listing.
print(check_output(["ls", "../input"]).decode("utf8"))

# Any results you write to the current directory are saved as output.

In [None]:
# Read at most `limit_row` rows of the training file, then keep only the rows
# belonging to a random sample of customers (`ncodpers`).
limit_row = 6000000
sample_size = 10000  # number of distinct customers to keep
train = pd.read_csv('../input/train_ver2.csv', nrows=limit_row)
# `.unique()` returns a NumPy array, which has no `.sample`; wrap it in a Series.
unique_ids = pd.Series(train.ncodpers.unique())
# `n` must be an integer and cannot exceed the population when sampling
# without replacement.
unique_ids = unique_ids.sample(n=min(sample_size, len(unique_ids)))
# Copy the filtered rows so later column assignments write to an independent
# frame rather than to a slice of the full read.
train = train[train.ncodpers.isin(unique_ids)].copy()

In [None]:
# Show the first rows of `train`.
train.head()

In [None]:
# Per column: does it contain at least one missing value?
train.isnull().any(axis=0)

In [None]:
# Summary statistics of `train`.
train.describe()

In [None]:
# Parse both date columns from 'YYYY-MM-DD' strings into datetimes.
for date_col in ('fecha_dato', 'fecha_alta'):
    train[date_col] = pd.to_datetime(train[date_col], format='%Y-%m-%d')
# Month number taken from fecha_dato.
train['month'] = train['fecha_dato'].apply(lambda stamp: stamp.month)
# Entries that cannot be parsed as numbers become NaN.
train['age'] = pd.to_numeric(train['age'], errors='coerce')

In [None]:
# Re-check which columns still contain missing values.
train.isnull().any(axis=0)

## Data cleaning

 

In [None]:
# Histogram (no KDE curve) of the non-missing ages.
fig, ax = plt.subplots(figsize=(8, 6))
sns.distplot(train['age'].dropna(), kde=False, color='#ffa726', ax=ax)
ax.set_title('Age Distribution')
ax.set_ylabel('Freq')
ax.set_xlabel('Age')

#### Let's separate the distribution and move each outlier to the mean of the closest group

In [None]:
# Ages below 18 are replaced by the mean age of the 18-30 group.
train.loc[train.age < 18, "age"] = train.loc[(train.age >= 18) & (train.age <= 30), "age"].mean(skipna=True)
# Ages above 100 are replaced by the mean age of the 30-100 group.
train.loc[train.age > 100, "age"] = train.loc[(train.age >= 30) & (train.age <= 100), "age"].mean(skipna=True)
# Assign the filled column back: a chained `train['age'].fillna(..., inplace=True)`
# acts on a temporary Series and is not guaranteed to update `train`.
train['age'] = train['age'].fillna(train['age'].mean())
train['age'] = train['age'].astype(int)

In [None]:
# Histogram (no KDE curve) of the age column on a white-grid theme.
sns.set_style('whitegrid')
fig, ax = plt.subplots(figsize=(8, 6))
sns.distplot(train['age'], ax=ax, kde=False)
ax.set_title('Age Distribution')
ax.set_xlabel('Age')
ax.set_ylabel('Freq')

#### ind_nuevo indicates whether a customer is new or not

In [None]:
# Number of rows with a missing ind_nuevo.
pd.isnull(train['ind_nuevo']).sum()

In [None]:
# For rows with a missing ind_nuevo, count the rows per customer
# (groups kept in order of first appearance), then show the largest count.
month_active = (
    train[train['ind_nuevo'].isnull()]
    .groupby('ncodpers', sort=False)
    .size()
)
month_active.max()

#### These are all new customers

In [None]:
# Missing ind_nuevo is filled with 1. The result is assigned back because a
# chained `train[col].fillna(..., inplace=True)` acts on a temporary Series
# and is not guaranteed to update `train`.
train['ind_nuevo'] = train['ind_nuevo'].fillna(1)

In [None]:
# Convert antiguedad to numbers (unparseable entries become NaN) and count
# how many values are missing afterwards.
train['antiguedad'] = pd.to_numeric(train['antiguedad'], errors='coerce')
train['antiguedad'].isnull().sum()

#### These are probably the same people that we just determined were new customers. Let's double-check

In [None]:
# Summary of ind_nuevo restricted to the rows whose antiguedad is missing.
train['ind_nuevo'][train['antiguedad'].isnull()].describe()

In [None]:
# Missing seniority takes the column minimum ...
train['antiguedad'] = train['antiguedad'].fillna(train['antiguedad'].min())
# ... and any negative value is raised to 0.
train['antiguedad'] = train['antiguedad'].clip(lower=0)

#### Some entries are missing the date on which they joined the company

In [None]:
# Sort the join dates and renumber them 0..n-1; the previous row labels are
# kept in an 'index' column of `dates`.
dates = train.loc[:,'fecha_alta'].sort_values().reset_index()
# Middle position of the sorted dates.
median_date = int(np.median(dates.index.values))
# Look the middle position up in the sorted `dates` frame. Using it as a row
# label of `train` would select an unrelated row (or raise KeyError when that
# label is absent), not the median date.
train.loc[train['fecha_alta'].isnull(),'fecha_alta'] = dates.loc[median_date,'fecha_alta']
train['fecha_alta'].describe()

In [None]:
# Frequency of each indrel value.
train['indrel'].value_counts()

In [None]:
# Missing indrel is filled with 1. The result is assigned back because a
# chained `train[col].fillna(..., inplace=True)` acts on a temporary Series
# and is not guaranteed to update `train`.
train['indrel'] = train['indrel'].fillna(1)

#### tipodom doesn't seem to be useful, and the province code is not needed because the name of the province exists in nomprov.

In [None]:
# Remove the tipodom and cod_prov columns from `train` in place.
train.drop(labels=['tipodom', 'cod_prov'], axis='columns', inplace=True)

In [None]:
# Which columns still contain missing values?
train.isnull().any(axis=0)

In [None]:
# Number of rows with a missing ind_actividad_cliente.
pd.isnull(train['ind_actividad_cliente']).sum()

In [None]:
# Distinct province names present in nomprov.
train['nomprov'].unique()

####   There was an issue with the unicode character ñ in A Coruña. I'll manually fix it

In [None]:
# Normalise the province name to plain ASCII. Two spellings are matched: the
# mis-decoded form (the UTF-8 bytes of "Ñ" read as two separate characters)
# and the correctly decoded "CORUÑA, A". The first alone never matches when
# the CSV is decoded as UTF-8.
train.loc[train.nomprov.isin(["CORU\xc3\x91A, A", "CORU\u00d1A, A"]), "nomprov"] = "CORUNA, A"

In [None]:
# Rows without a province name get the label 'Unknown'.
train.loc[pd.isnull(train['nomprov']), 'nomprov'] = 'Unknown'

In [None]:
# Number of rows with a missing renta.
pd.isnull(train['renta']).sum()

In [None]:
# Frequency of each ind_nomina_ult1 value.
train['ind_nomina_ult1'].value_counts()

In [None]:
# Missing values in ind_nomina_ult1 and ind_nom_pens_ult1 are filled with 0.
# Each result is assigned back because a chained
# `train[col].fillna(..., inplace=True)` acts on a temporary Series and is not
# guaranteed to update `train`.
train['ind_nomina_ult1'] = train['ind_nomina_ult1'].fillna(0)
train['ind_nom_pens_ult1'] = train['ind_nom_pens_ult1'].fillna(0)

####  I will fill the empty strings either with the most common value or create an unknown category based on what I think makes more sense.

In [None]:
# Frequency of each indfall value.
train['indfall'].value_counts()

In [None]:
# Missing indfall is filled with 'N'. The result is assigned back because a
# chained `train[col].fillna(..., inplace=True)` acts on a temporary Series
# and is not guaranteed to update `train`.
train['indfall'] = train['indfall'].fillna('N')

In [None]:
# Frequency of each tiprel_1mes value.
train['tiprel_1mes'].value_counts()

In [None]:
# Missing tiprel_1mes is filled with 'I', then the column is stored as a
# categorical. The filled column is assigned back because a chained
# `train[col].fillna(..., inplace=True)` acts on a temporary Series and is not
# guaranteed to update `train`.
train['tiprel_1mes'] = train['tiprel_1mes'].fillna('I')
train['tiprel_1mes'] = train['tiprel_1mes'].astype('category')

In [None]:
# Frequency of each indrel_1mes value.
train['indrel_1mes'].value_counts()

In [None]:
# Lookup table that collapses the mixed float / string spellings of the same
# code onto a single canonical string. The float key 4.0 is included so that
# it is normalised like 1.0, 2.0 and 3.0 instead of passing through unchanged.
map_dict = {
    1.0: "1",
    "1.0": "1",
    "1": "1",
    2.0: "2",
    "2.0": "2",
    "2": "2",
    3.0: "3",
    "3.0": "3",
    "3": "3",
    4.0: "4",
    "4.0": "4",
    "4": "4",
    "P": "P",
}

In [None]:
# Missing indrel_1mes is filled with 'P'. The result is assigned back because
# a chained `train[col].fillna(..., inplace=True)` acts on a temporary Series
# and is not guaranteed to update `train`.
train['indrel_1mes'] = train['indrel_1mes'].fillna('P')
# Translate each value through map_dict; values without an entry are kept as is.
train['indrel_1mes'] = train['indrel_1mes'].apply(lambda x:map_dict.get(x,x))
train['indrel_1mes'] = train['indrel_1mes'].astype('category')

In [None]:
# Names of the object-dtype columns that contain at least one missing value.
string_data = train.select_dtypes(include=['object'])
missing_columns = [
    name
    for name in string_data.columns
    if string_data[name].isnull().any()
]
# The temporary object-only frame is no longer needed.
del string_data

In [None]:
# Every column in missing_columns except the three named here gets its
# missing entries replaced by 'Unknown'.
unknown_col = [
    col
    for col in missing_columns
    if col not in ["indfall", "tiprel_1mes", "indrel_1mes"]
]
for col in unknown_col:
    is_missing = train[col].isnull()
    train.loc[is_missing, col] = 'Unknown'

In [None]:
# Final check: which columns still contain missing values?
train.isnull().any(axis=0)