In [None]:
import pandas as pd
import numpy as np
import sklearn as sk

# pandas: render floats with one decimal place instead of scientific notation.
pd.set_option('display.float_format', '{:.1f}'.format)
# NumPy: print up to 8 decimals and suppress scientific notation.
# `suppress` is a boolean flag; the string 'True' only worked because any
# non-empty string (even 'False') is truthy.
np.set_printoptions(precision=8, suppress=True)


In [None]:
# Load the raw dataset from the CSV file next to the notebook.
data = pd.read_csv('data1.csv')

# Number of missing entries in each column.
# (data.dtypes and data.isnull() are other quick checks to run here.)
data.isnull().sum()

In [None]:
# Only the last expression of a cell is rendered, so a bare `data.shape`
# above `data.head()` was never shown. Print it explicitly instead.
print('(rows, columns):', data.shape)
data.head()

In [None]:
# First three columns (positions 0-2); presumably the independent variables.
# Later cells assign into and drop columns from this frame, so take an
# explicit copy: writing into a slice of `data` raises SettingWithCopyWarning
# and may or may not touch `data` itself.
# (Append `.values` to get a plain multi-dimensional array instead.)
indx = data.iloc[:, 0:3].copy()

# Fourth column (position 3), kept as a one-column DataFrame; presumably the
# dependent variable. Copied for the same reason as above.
depy = data.iloc[:, [3]].copy()

**Handling Missing Values**

In [None]:
# Alternative way to handle missing values: drop every record (row) that
# contains one. Left disabled; uncomment the two lines below to use it.
#data.dropna(inplace=True)
#data

**SimpleImputer**

In [None]:
# using imputer to add values in place of nan
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(indx.iloc[:,1:3])
indx.iloc[:,1:3] = imputer.transform(indx.iloc[:,1:3])
indx

**Categorical / Numerical Columns**

In [None]:
# Categorical features: columns stored with the generic object dtype.
cat_col = indx.select_dtypes(include='object').columns.tolist()
# A bare `cat_col` in the middle of a cell is never rendered; print it so both
# lists are visible.
print('Categorical columns:', cat_col)

# Numerical features. Comparing the dtype with the strings 'int' / 'float'
# only matches the platform-default widths and misses columns such as int32
# or float32; 'number' selects integer and float columns of any width.
num_col = indx.select_dtypes(include='number').columns.tolist()
num_col

['Age', 'Salary']

**OneHotEncoder >> Categorical columns with more than two categories**

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode every categorical column found in `cat_col`.
ohe = OneHotEncoder()
# Reuse indx's index so a later index-based join lines up row for row.
en_x = pd.DataFrame(ohe.fit_transform(indx[cat_col]).toarray(), index=indx.index)
# Name the new columns "<feature>_<category>" instead of 0, 1, 2, ...:
# integer names next to string names ('Age', 'Salary') give a frame with
# mixed-type column labels, which recent scikit-learn estimators reject.
en_x.columns = ohe.get_feature_names_out()
en_x

In [None]:
# Swap the raw categorical columns for their one-hot encoded versions.
# Dropping `cat_col` (rather than the hard-coded 'City') keeps this step
# consistent with the encoder, which was fitted on every column in `cat_col`.
# Rebinding instead of `inplace=True` also leaves no half-modified frame
# behind if the join fails.
indx = indx.drop(columns=cat_col).join(en_x)
indx

**Label Encoder >> For binary categorical columns**

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the distinct 'Bonus' labels, then replace each one with its integer code.
le = LabelEncoder().fit(depy['Bonus'])
depy['Bonus'] = le.transform(depy['Bonus'])
depy

**Simple preprocessing technique**

In [None]:
# Build a single preprocessing step: StandardScaler on the numeric feature
# columns and OneHotEncoder on the categorical feature columns.
# NOTE(review): features_num, features_cat and X_train are not defined in any
# cell visible in this notebook; confirm where they come from, otherwise this
# cell raises NameError on a fresh kernel.
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer

preprocessor = make_column_transformer(
    (StandardScaler(), features_num),
    (OneHotEncoder(), features_cat),
)

# Fit the transformer and overwrite X_train with the transformed result.
X_train = preprocessor.fit_transform(X_train)

In [None]:
# Same idea as a plain column transformer, but each branch first fills
# missing values. This rebinds `preprocessor`.
# StandardScaler, OneHotEncoder and make_column_transformer are not imported
# in this cell; it relies on the imports made in the previous cell.
# NOTE(review): features_num and features_cat are not defined in any cell
# visible in this notebook; confirm where they come from.
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline

# Numeric branch: constant-fill (no fill_value is passed, so the imputer's
# default constant is used), then standardise.
transformer_num = make_pipeline(
    SimpleImputer(strategy="constant"), # there are a few missing values
    StandardScaler(),
)
# Categorical branch: fill gaps with the literal "NA", then one-hot encode;
# handle_unknown='ignore' is set for categories not seen during fit.
transformer_cat = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="NA"),
    OneHotEncoder(handle_unknown='ignore'),
)

preprocessor = make_column_transformer(
    (transformer_num, features_num),
    (transformer_cat, features_cat),
)

**Train Test Split**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
train_x, test_x, train_y, test_y = train_test_split(
    indx,
    depy,
    test_size=0.25,
    random_state=1,
)

**Outlier Detection and Removal**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sb

#sb.scatterplot(x='Salary',y='Age',hue='Bonus',data=data)

# Histogram of Age in 15 bins, used to eyeball outliers. Built on an explicit
# Axes so the figure carries its own title and axis labels, and finished with
# plt.show() so the cell renders only the plot rather than the raw
# (counts, bins, patches) tuple returned by hist().
fig, ax = plt.subplots()
ax.hist(data['Age'], bins=15)
ax.set(title='Distribution of Age', xlabel='Age', ylabel='Number of records')
plt.show()

In [None]:
# Ages outside the 5th-95th percentile range are treated as outliers.
# (The original note reports a lower limit of 24.6 for this dataset.)
lower_limit = data['Age'].quantile(0.05)
upper_limit = data['Age'].quantile(0.95)

# Inspect the rows that fall outside the limits:
#data[data['Age']>upper_limit]
#data[data['Age']<lower_limit]

# A bare `lower_limit` in the middle of a cell is never rendered, so show both
# limits together as the cell's final expression.
pd.Series({'lower_limit': lower_limit, 'upper_limit': upper_limit}, name='Age')

In [None]:
# Keep only the rows whose Age lies strictly between the two limits; rows
# equal to a limit (and rows with a missing Age) are dropped.
data = data.loc[data['Age'].gt(lower_limit) & data['Age'].lt(upper_limit)]
data.head()

**Ordinal Data to Numerical Data**
 > Column values such as [bad, avg, good, very good] are ordinal data: the categories have a natural order.

In [None]:
# Small hand-made example frame: one row per player (name, age, rating).
cricketers = pd.DataFrame(
    [
        ('Virat', 26, 'Good'),
        ('Dhoni', 32, 'Best'),
        ('Rohit', 28, 'Avg'),
        ('Dhawan', 23, 'Good'),
        ('Hardik', 25, 'Avg'),
        ('Gayle', 30, 'Best'),
        ('Bhuvi', 29, 'Good'),
        ('Boult', 30, 'Good'),
        ('Nabi', 21, 'Avg'),
        ('Bravo', 22, 'Best'),
    ],
    columns=['Name', 'Age', 'Rating'],
)
cricketers.head()

In [None]:
# Ordinal encoding: each rating label gets a number that respects its rank.
data_map = dict(Good=3, Best=5, Avg=1)

# Replace the labels with their numeric rank (labels missing from the map
# would become NaN).
cricketers['Rating'] = cricketers['Rating'].map(data_map)
cricketers.head()

**Data Binning >> Grouping of data into different groups**

In [None]:
def binningfun(col, cut_points, labels=None):
    """Bin a numeric column into groups separated by ``cut_points``.

    The bin edges are ``[col.min(), *cut_points, col.max()]``, so
    ``len(cut_points) + 1`` bins are produced. Each bin is closed on the right,
    and the lowest edge is included so the minimum value is binned too.

    Parameters
    ----------
    col : pandas.Series
        Numeric values to bin.
    cut_points : iterable of numbers
        Interior bin edges in increasing order, strictly between the column's
        minimum and maximum. Any iterable works (list, tuple, NumPy array).
    labels : sequence, optional
        One label per bin. When omitted (or empty) the bins are labelled
        ``0 .. len(cut_points)``.

    Returns
    -------
    pandas.Series
        Categorical series, aligned with ``col``, holding each value's label.

    Raises
    ------
    ValueError
        Propagated from ``pd.cut`` when the resulting edges are not strictly
        increasing or the number of labels does not match the number of bins.
    """
    # Unpack instead of `[min] + cut_points + [max]`: list concatenation fails
    # for tuples and silently turns into element-wise addition for NumPy arrays.
    break_points = [col.min(), *cut_points, col.max()]

    # Explicit checks instead of `not labels`, which raises for array-like
    # labels. False and empty sequences still fall back to the default labels.
    if labels is None or labels is False or len(labels) == 0:
        labels = range(len(break_points) - 1)

    return pd.cut(col, bins=break_points, labels=labels, include_lowest=True)

In [None]:
# Interior edges 25 and 30 split Age into three groups:
# up to 25, above 25 up to 30, and above 30.
cut_points = [25, 30]
labels = ['Young', 'Senior', 'Super Senior']
cricketers['Experiance'] = binningfun(
    col=cricketers['Age'],
    cut_points=cut_points,
    labels=labels,
)
cricketers.head()

In [None]:
# OneHotEncoder
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()
en_x  = pd.DataFrame(ohe.fit_transform(cricketers[['Experiance']]).toarray())
en_x.head()

cricketers.join(en_x)
cricketers.drop('Age',axis=1,inplace=True)
cricketers.head()

In [None]:
# Attach the one-hot columns, then remove the label column they were derived from.
cricketers = cricketers.join(en_x).drop(columns='Experiance')
cricketers.head()

**Removal of Duplicate Records**

In [None]:
# Load the sample file used for the duplicate-removal examples.
dupdata = pd.read_csv(filepath_or_buffer='Duplicate_preproc.csv')
dupdata

In [None]:
# Drop rows that are identical in every column, keeping the first occurrence.
# The result is only displayed; `dupdata` itself is left unchanged.
dupdata.drop_duplicates(keep='first')

In [None]:
# Compare rows on the 'Car' column only: the first row for each distinct
# 'Car' value is kept. The result is only displayed; `dupdata` is unchanged.
dupdata.drop_duplicates(subset='Car', keep='first')

**Merging Multiple Datasets**