In [None]:
def print_df(df):
    """Print the shape of ``df`` followed by its first rows (``df.head()``)."""
    for summary in (df.shape, df.head()):
        print(summary)

# Set up

Create a new conda environment and install the necessary libraries:

* `pip install -r /path/requirements.txt`



## Importing necessary libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler, Normalizer
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.utils import shuffle
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer

import missingno as msno

%matplotlib inline

# Show every column when a DataFrame is displayed (None removes the column limit).
# Uses the public pd.set_option entry point rather than the incidental pd.pandas attribute.
pd.set_option('display.max_columns', None)

from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

import pickle

In [None]:
# Importing data set from csv file & printing first 5 data point

In [None]:
dataset_filename = "Mixed_dataset.csv"
target_feature = 'SalePrice'
final_model_name = 'LinearRegression_model.sav'

In [None]:
dataset = pd.read_csv(dataset_filename)
print_df(dataset)

## Data Analysis

##### Dataset shape / dimension

In [None]:
dataset.shape

##### Exploring Dataset Column

In [None]:
dataset.columns

In [None]:
dataset.dtypes

In [None]:
dataset.info()

##### Checking missing / null values

In [None]:
# Checking is there any feature has null values
dataset.isnull().any()

In [None]:
#  Sum of null/missing values in the featue
dataset.isnull().sum()

In [None]:
## Check the percentage of nan/null/missing values present in each feature.
# A feature qualifies as soon as it has at least one missing value (> 0); the
# previous "> 1" threshold skipped features with exactly one missing entry.
features_with_na = [feature for feature in dataset.columns if dataset[feature].isnull().sum() > 0]

for feature in features_with_na:
    print(feature, round(dataset[feature].isnull().mean() * 100, 2), ' % missing values')

In [None]:
# Null data in the dataset
null_data = dataset[dataset.isnull().any(axis=1)]
print_df(null_data)

In [None]:
percentage_of_na_against_total_dataset = round((null_data.shape[0] / dataset.shape[0]) * 100 ,2)
print('percentage of na data against total dataset {} % '.format(percentage_of_na_against_total_dataset))

### Plotting null record columns as heatmap & matrix

In [None]:
sns.heatmap(dataset.isnull(), cbar=False)

In [None]:
msno.matrix(dataset, labels=True)

### Visualizing the features which have null values

In [None]:
# Visualizing the features that contain na/null values.
# NOTE: get_categorical_features, get_date_features, get_numerical_features,
# visualize_barplot and plot_numeric are defined in later cells of this notebook;
# those cells must have been executed before this one.

null_dataset = dataset[features_with_na]

# Categorical features: one bar plot of value counts per feature.
cat_feature = get_categorical_features(null_dataset)
for feature in cat_feature:
    print("Visualization of " , feature)
    visualize_barplot(null_dataset,feature)

# Numerical features excluding the year-like ones. A list comprehension keeps
# the column order of the frame; iterating a set would plot in an order that
# can change from one kernel session to the next.
yr_feature = set(get_date_features(null_dataset))
num_yr_feature = get_numerical_features(null_dataset)
num_feature = [feature for feature in num_yr_feature if feature not in yr_feature]
for feature in num_feature:
    print("Visualization of " , feature)
    plot_numeric(null_dataset,feature)

### Visualization of categorical Variables

In [None]:
# Bar plot (pandas/matplotlib) of the value counts of a categorical variable
def visualize_barplot(df , variable,x_axis =None , y_axis=None,title=None):
    """Plot how often each distinct value of ``df[variable]`` occurs as a bar chart.

    ``title`` is forwarded to the pandas plot call; ``x_axis`` and ``y_axis``
    are passed to ``plt.xlabel`` / ``plt.ylabel``. The figure is shown
    immediately with ``plt.show()``.
    """
    category_counts = df[variable].value_counts()
    category_counts.plot(kind='bar', title=title, grid=True)
    plt.xlabel(x_axis)
    plt.ylabel(y_axis)
    plt.show()

In [None]:
# Visualization of categorical Variables using sns countplot
def plot_sns_cat_feature(df , variable,x_axis =None , y_axis=None,title=None):
    """Draw a seaborn count plot of the column ``variable`` of ``df``.

    Parameters
    ----------
    df : DataFrame containing the column ``variable``.
    variable : name of the column whose values are counted.
    x_axis, y_axis, title : optional axis labels and title. They are applied
        only when given; when left as ``None`` seaborn's defaults are kept.
    """
    ax = sns.countplot(x=variable, data=df)
    if x_axis is not None:
        ax.set_xlabel(x_axis)
    if y_axis is not None:
        ax.set_ylabel(y_axis)
    if title is not None:
        ax.set_title(title)
    plt.show()

In [None]:
# Visualization of categorical Variables using sns count plot with 2 variable
def plot_sns_cat_feature_with_target(df , variable, target_feature ,x_axis =None , y_axis=None,title=None):
    """Draw a seaborn count plot of ``target_feature`` split by ``variable``.

    Parameters
    ----------
    df : DataFrame containing both columns.
    variable : column used for the ``hue`` grouping.
    target_feature : column placed on the x axis.
    x_axis, y_axis, title : optional axis labels and title. They are applied
        only when given; when left as ``None`` seaborn's defaults are kept.
    """
    ax = sns.countplot(x=target_feature, hue=variable, data=df)
    if x_axis is not None:
        ax.set_xlabel(x_axis)
    if y_axis is not None:
        ax.set_ylabel(y_axis)
    if title is not None:
        ax.set_title(title)
    plt.show()

### Visualization of Numerical Variables

In [None]:
# Visualization of Numerical Variables using a seaborn histogram
def plot_sns_numeric_feature(df,variable ,x_axis =None , y_axis=None,title=None):
    """Draw a 15-bin seaborn histogram (no KDE curve) of ``df[variable]`` with a grid.

    Uses ``sns.histplot`` (already used elsewhere in this notebook) instead of
    the deprecated ``sns.distplot``; the grid is switched on through the
    returned axes because the plotting call has no ``grid`` option.

    Parameters
    ----------
    df : DataFrame containing the column ``variable``.
    variable : name of the numerical column to plot.
    x_axis, y_axis, title : optional axis labels and title. They are applied
        only when given; when left as ``None`` seaborn's defaults are kept.
    """
    ax = sns.histplot(df[variable], kde=False, bins=15)
    ax.grid(True)
    if x_axis is not None:
        ax.set_xlabel(x_axis)
    if y_axis is not None:
        ax.set_ylabel(y_axis)
    if title is not None:
        ax.set_title(title)
    plt.show()

In [None]:
# Histogram (matplotlib) of a numerical variable
def plot_numeric(df,variable ,x_axis =None , y_axis=None,title=None):
    """Plot a 20-bin matplotlib histogram of ``df[variable]`` and show it.

    ``x_axis``, ``y_axis`` and ``title`` are passed to ``plt.xlabel``,
    ``plt.ylabel`` and ``plt.title`` respectively.
    """
    values = df[variable]
    plt.hist(values, bins=20)
    plt.xlabel(x_axis)
    plt.ylabel(y_axis)
    plt.title(title)
    plt.show()

In [None]:
# TO visualize the features in histogram
dataset.hist(figsize=(15, 20))

In [None]:
# Fetching feature lists based on the feature data type

In [None]:
# Names of the categorical (object-dtype) features
def get_categorical_features(df):
    """Return the names of the columns of ``df`` whose dtype is object ('O').

    The number of columns found is printed before the list is returned.
    """
    categorical_features = []
    for column in df.columns:
        if df[column].dtypes == 'O':
            categorical_features.append(column)
    print('Number of categorical features : ', len(categorical_features))
    return categorical_features

In [None]:
# Names of the numerical (non-object-dtype) features
def get_numerical_features(df):
    """Return the names of the columns of ``df`` whose dtype is not object ('O').

    The number of columns found is printed before the list is returned.
    """
    numerical_features = []
    for column in df.columns:
        if df[column].dtypes != 'O':
            numerical_features.append(column)
    print('Number of numerical features : ', len(numerical_features))
    return numerical_features

In [None]:
# Names of the year-like (date) features
def get_date_features(df):
    """Return the non-object-dtype columns of ``df`` whose name contains a year marker.

    A column qualifies when its name contains 'Yr', 'yr', 'Year' or 'year'.
    The number of columns found is printed before the list is returned.
    """
    year_markers = ('Yr', 'yr', 'Year', 'year')
    year_feature = []
    for column in df.columns:
        if df[column].dtypes == 'O':
            continue
        if any(marker in column for marker in year_markers):
            year_feature.append(column)
    print('Number of date features: ', len(year_feature))
    return year_feature

### visualizing the data to find the missing value based on the target value ( Only for Classification)

In [None]:
for feature in cat_feature:
    print("Visualization of " , feature , " vs " , target_feature)
#     plot_sns_cat_feature_with_target(null_dataset,feature,target_feature)
    plot_sns_cat_feature_with_target(dataset,feature,target_feature)

##### Finding unique values

In [None]:
dataset.nunique()

##### Checking dataset descriptive statistics

In [None]:
dataset.describe()

In [None]:
dataset.describe().T

In [None]:
# Descriptive statistics restricted to the string (object-dtype) columns.
# `dataset` is the frame loaded earlier in this notebook; the name `df` was never defined.
dataset.describe(include=["O"])

In [None]:
# Descriptive statistics restricted to the columns with pandas 'category' dtype.
# `dataset` is the frame loaded earlier in this notebook; the name `df` was never defined.
dataset.describe(include=['category'])

In [None]:
# Skewness of the numerical columns only; non-numeric columns cannot be reduced.
dataset.skew(numeric_only=True)

In [None]:
# Kurtosis of the numerical columns only; non-numeric columns cannot be reduced.
dataset.kurtosis(numeric_only=True)

# Handling missing values

In [None]:
# Drop missing value

In [None]:
# Dropping feature
df.drop(feature, axis='columns', inplace=True)

In [None]:
# drop the missing values
df.dropna(axis=1)
df.dropna(inplace=True)
df.dropna(how='any')

In [None]:
# filling the missing values with mean , median , mode
# The lines below are alternative strategies shown side by side; keep only the
# one appropriate to the feature being filled.
# NOTE(review): `df` and `feature` are not assigned in this cell — confirm which
# frame/column is intended before running.

# Whole frame: fill each numerical column with its own mean.
df = df.fillna(df.mean(numeric_only=True))

# Single column: assign the result back instead of calling fillna(inplace=True)
# on a column selection, which may operate on a temporary copy.
dataset[feature] = dataset[feature].fillna(dataset[feature].mean())
dataset[feature] = dataset[feature].fillna(dataset[feature].median())
# mode() returns a Series (there can be ties); use its first entry as the scalar
# fill value, otherwise fillna aligns on the index and fills almost nothing.
dataset[feature] = dataset[feature].fillna(dataset[feature].mode().iloc[0])

In [None]:
## Imputing missing values using Imputer

In [None]:
# Mean imputation is only defined for numerical data, so restrict the
# features that contain missing values to their numerical columns.
numeric_na_frame = dataset[features_with_na].select_dtypes(include='number')

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# fit_transform returns a NumPy array; wrap it in a DataFrame so that the
# column names are kept and .head() is available.
dataset_features_na_imputed = pd.DataFrame(
    imputer.fit_transform(numeric_na_frame),
    columns=numeric_na_frame.columns,
    index=numeric_na_frame.index,
)
dataset_features_na_imputed.head()

In [None]:
## Imputing missing values using IterativeImputer

In [None]:
imp = IterativeImputer(max_iter=10, random_state=0)
# Fit and transform the same columns: the imputer must be fitted on data with
# the same features it is later asked to transform. Only numerical columns are
# passed to it here.
numeric_na_frame = dataset[features_with_na].select_dtypes(include='number')
print(np.round(imp.fit_transform(numeric_na_frame)))

### Outliers

## Numerical features are in 2 types
### Discrete features
### Continuous features

In [None]:
## Deriving Discrete features from numerical features
def get_discrete_features_from_numerical(df):
    """Return the numerical, non-year columns of ``df`` with fewer than 10 distinct values.

    Year-like columns (as reported by ``get_date_features``) are excluded from
    the numerical columns (as reported by ``get_numerical_features``). The
    result follows the column order of ``df``. The number of discrete features
    is printed before the list is returned; the two helper calls print their
    own counts as well.
    """
    year_features = set(get_date_features(df))
    discrete_feature = [
        feature
        for feature in get_numerical_features(df)
        if feature not in year_features and len(df[feature].unique()) < 10
    ]
    print("Discrete features Count: {}".format(len(discrete_feature)))
    return discrete_feature

In [None]:
## Deriving Continuous features from numerical features
def get_continous_features_from_numerical(df):
    """Return the numerical columns of ``df`` that are neither year-like nor discrete.

    Year-like columns come from ``get_date_features`` and discrete columns from
    ``get_discrete_features_from_numerical``; everything else reported by
    ``get_numerical_features`` is treated as continuous. The result follows the
    column order of ``df``. The count is printed and the list is returned (the
    helper calls print their own counts as well).
    """
    year_features = set(get_date_features(df))
    numerical_features = get_numerical_features(df)
    discrete_features = set(get_discrete_features_from_numerical(df))
    excluded = year_features | discrete_features
    continuous_feature = [feature for feature in numerical_features if feature not in excluded]
    print("Continuous features Count {}".format(len(continuous_feature)))
    return continuous_feature

# Visualizing the all features for feature engineering

In [None]:
# Visualizing all input features (every column except the target)

X_dataset = dataset.drop([target_feature], axis = 1)

# Categorical features: one bar plot of value counts per feature.
cat_feature = get_categorical_features(X_dataset)
for feature in cat_feature:
    print("Visualization of " , feature)
    visualize_barplot(X_dataset,feature)

# Numerical features excluding the year-like ones. A list comprehension keeps
# the column order of the frame; iterating a set would plot in an order that
# can change from one kernel session to the next.
yr_feature = set(get_date_features(X_dataset))
num_yr_feature = get_numerical_features(X_dataset)
num_feature = [feature for feature in num_yr_feature if feature not in yr_feature]
for feature in num_feature:
    print("Visualization of " , feature)
    plot_numeric(X_dataset,feature)

In [None]:
# Analyse categorical variable 

In [None]:
# `cat_feature` is the module-level list of categorical feature names built by
# get_categorical_features; `categorical_features` only exists inside that function.
for feature in cat_feature:
    print('The feature is {} and number of categories are {}'.format(feature,dataset[feature].nunique()))

In [None]:
# kde for bell curve
sns.histplot(df.column_name , kde =True)

In [None]:
# distrubution plot

sns.distplot(df.column_name)

In [None]:
# Visualize the mean for the feaure data
dataset.groupby(feature).mean()

In [None]:
# Converting the target variable to numeric if it is a categorical feature with binary output

In [None]:
d = {'Y': 1, 'N': 0}
df[target_feature] = df[target_feature].map(d)

In [None]:
Y_dataset = dataset[target_feature]

In [None]:
df[target_feature].value_counts()

In [None]:
# Plot the distribution of the target variable.
# plot_sns_cat_feature looks the column up by name in the frame it is given, so
# it needs the DataFrame that contains the target column, not the target Series.
plot_sns_cat_feature(dataset,target_feature)

# Imbalanced dataset

In [None]:
# sampling with default data points and creating new dataset which will be used for model creation

In [None]:
# Split the rows by class label. The column is selected through the
# `target_feature` variable; `df.target_feature` would instead look for a
# column literally named "target_feature".
class_0 = df[df[target_feature] == 0]
class_1 = df[df[target_feature] == 1]
print(class_0.shape , class_1.shape)

In [None]:
# Number sampling: draw as many class-0 rows as there are class-1 rows
# (capped at the rows available) instead of a hard-coded count, with a fixed
# seed so the sample is reproducible.
class_0_sample = class_0.sample(n=min(len(class_0), len(class_1)), random_state=0)
class_0_sample.shape

In [None]:
new_dataset = pd.concat([class_0_sample,class_1],axis=0)
new_dataset.shape

In [None]:
# down sampling

In [None]:
# Over-sample with SMOTE. The resampling entry point is fit_resample, the same
# method used with RandomOverSampler in this notebook.
# NOTE(review): `c_data` is not defined in the visible notebook; its first
# column is used as the target and the remaining columns as features — confirm.
oversample = SMOTE()
X, y = oversample.fit_resample(c_data[c_data.columns[1:]], c_data[c_data.columns[0]])
usampled_df = X.assign(Churn = y)

In [None]:
ros = RandomOverSampler(random_state=0)
X, y = ros.fit_resample(X, y);
output = pd.concat([X, y], axis=1)

In [None]:
# Converting continuous values into discrete values

In [None]:
# NOTE(review): `continuous_feature` must hold the list of continuous column
# names (it is not assigned at module level in the visible notebook) — confirm.
continuous_values = dataset[continuous_feature]
# fit_transform returns a NumPy array; wrap it in a DataFrame so that the
# column names are kept and .head() is available.
binarized_data = pd.DataFrame(
    Binarizer().fit_transform(continuous_values),
    columns=continuous_values.columns,
    index=continuous_values.index,
)
binarized_data.head()

In [None]:
# Finding the correlation b/w the dataset

In [None]:
plt.figure(figsize = (15,15))
# Correlation is only defined for numerical columns; numeric_only=True leaves
# the categorical columns out instead of failing on them.
sns.heatmap(dataset.corr(numeric_only=True), fmt='.2f',annot = True, cmap = 'RdYlGn')
# Alternative stylings:
# ax = sns.heatmap(corr, xticklabels = corr.columns, yticklabels = corr.columns,linewidths = 0.2, cmap = 'YlGnBu', annot = True)
# sns.heatmap(corr , cbar = True , square = True , fmt = '.1f' , annot = True , annot_kws = { 'size' : 8} , cmap = 'Blues')


In [None]:
sns.pairplot(dataset, hue = target_feature)

## Encoding categorical data  

### Label Encoder

#### Label Encoder will replace every categorical variable with number. Useful for replacing yes by 1, no by 0.

In [None]:
lable_encoder = LabelEncoder()
# fit_transform returns a NumPy array of integer codes; wrap it in a Series
# aligned with the dataset so that .head() is available.
cat_feature_encoded = pd.Series(
    lable_encoder.fit_transform(dataset['categorical_feature']),
    index=dataset.index,
    name='categorical_feature',
)
cat_feature_encoded.head()

In [None]:
lable_encoder.classes_

## One hot encoder

#### One Hot Encoder will create a separate column for every variable and give a value of 1 where the variable is present

In [None]:
# The encoder's default sparse output is converted with .toarray(), which avoids
# the `sparse=` keyword that newer scikit-learn releases no longer accept.
one_hot_encoder = OneHotEncoder()
# The encoder expects 2-D input, hence the double brackets (a one-column frame).
encoded_matrix = one_hot_encoder.fit_transform(dataset[['categorical_feature']])
# Wrap the dense array in a DataFrame with one column per category so that
# .head() is available.
encoded = pd.DataFrame(
    encoded_matrix.toarray(),
    columns=one_hot_encoder.categories_[0],
    index=dataset.index,
)
encoded.head()

In [None]:
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

## Pandas function get_dummies

#### Like One Hot Encoder, it will create a separate column for every variable and give a value of 1 where the variable is present

In [None]:
cat_encoded = pd.get_dummies(dataset['categorical_feature'])
cat_encoded.head()

## Feature Scaling

It is one of the data transformation techniques used to bring all features into a reasonable range for easier calculation.

#### There are 5 commonly used ways to scale features.
1. __Min Max Scaling__: 
Will scale the input to have a minimum of 0 and a maximum of 1. That is, it scales the data into the range [0, 1]. This is useful when the parameters have to be on the same positive scale. But in this case, the outliers are lost.
$$X_{norm} = \frac{X - X_{min}}{X_{max} - X_{min}}$$

2. __Standardization__:
Will scale the input to have mean of 0 and variance of 1. 
$$X_{stand} = \frac{X - \mu}{\sigma}$$

3. __Normalizing__: 
Will scale the input to make the norm of 1. For instance, for 3D data the 3 independent variables will lie on a unit Sphere. 

4. __Log Transformation__:
Taking the log of data after any of above transformation. 

5. __Decimal scaling__:
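Converting the data into decimals by dividing every value by a power of 10 chosen from the largest data point, so that all values have an absolute value below 1.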

For most applications, Standardization is recommended. Min Max Scaling is recommended for Neural Networks. Normalizing is recommended when Clustering eg. KMeans. 

### Standardization

In [None]:
standard_scaler = StandardScaler()
# fit_transform returns a NumPy array; wrap it in a DataFrame so that .head()
# is available. Column names and index are reused when X is a DataFrame.
standardized_dataset = pd.DataFrame(
    standard_scaler.fit_transform(X),
    columns=getattr(X, 'columns', None),
    index=getattr(X, 'index', None),
)
standardized_dataset.head()

### Normalizing

In [None]:
normalizer = Normalizer()
# fit_transform returns a NumPy array; wrap it in a DataFrame so that .head()
# is available. Column names and index are reused when X is a DataFrame.
normolized_dataset = pd.DataFrame(
    normalizer.fit_transform(X),
    columns=getattr(X, 'columns', None),
    index=getattr(X, 'index', None),
)
normolized_dataset.head()

### MinMax Scaling

In [None]:
min_max_scaler = MinMaxScaler()
# fit_transform returns a NumPy array; wrap it in a DataFrame so that .head()
# is available. Column names and index are reused when X is a DataFrame.
min_max_scaled_dataset = pd.DataFrame(
    min_max_scaler.fit_transform(X),
    columns=getattr(X, 'columns', None),
    index=getattr(X, 'index', None),
)
min_max_scaled_dataset.head()

### Decimal Scaling

### Log Transform

In [None]:
# NOTE(review): this cell originally contained only the stray token
# `ela0989540`, which raises NameError. It is replaced by a log transform of
# the numerical columns, matching the section heading — confirm the intent.
# log1p computes log(1 + x), so zero values stay finite.
log_transformed_dataset = np.log1p(dataset.select_dtypes(include='number'))
log_transformed_dataset.head()

# Feature Selection

In [None]:
### Apply Feature Selection
# first, I specify the Lasso Regression model, and I
# select a suitable alpha (equivalent of penalty).
# The bigger the alpha the less features that will be selected.

# Then I use the selectFromModel object from sklearn, which
# will select the features which coefficients are non-zero

feature_sel_model = SelectFromModel(Lasso(alpha=0.005, random_state=0)) # remember to set the seed, the random state in this function
feature_sel_model.fit(X_train, y_train)


In [None]:
feature_sel_model.get_support()

In [None]:
# let's print the number of total and selected features

# this is how we can make a list of the selected features:
# get_support() is a boolean mask over the columns of X_train
selected_feat = X_train.columns[(feature_sel_model.get_support())]

# let's print some stats
print('total features: {}'.format((X_train.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
# The fitted Lasso lives on the selector created above (feature_sel_model);
# the name `sel_` was never defined in this notebook.
print('features with coefficients shrank to zero: {}'.format(
    np.sum(feature_sel_model.estimator_.coef_ == 0)))

In [None]:
selected_feat
X_train=X_train[selected_feat]

# Visualization

In [None]:
### Different plot design

### plt.plot(x,y, 'r+')
### plt.plot(x,y, 'g')
### plt.plot(x,y, 'g*')

In [None]:
def line_plot( x_feature , y_feature ,df ):
    """Draw a line plot of ``df[y_feature]`` against ``df[x_feature]``.

    Parameters
    ----------
    x_feature, y_feature : column names (strings); also used for the axis
        labels and the "<x> vs <y>" title.
    df : DataFrame containing both columns.
    """
    # Select the columns by the names passed in; `df.x_feature` would look for
    # a column literally called "x_feature".
    plt.plot(df[x_feature] , df[y_feature])
    plt.title( x_feature + ' vs ' + y_feature)
    plt.xlabel(x_feature)
    plt.ylabel(y_feature)
    plt.show()

In [None]:
def line_plot_2data( x_feature , y_feature , df ,df1= None):
    """Draw the same x/y line plot for one or two DataFrames on shared axes.

    Parameters
    ----------
    x_feature, y_feature : column names (strings); also used for the axis
        labels and the "<x> vs <y>" title.
    df : DataFrame containing both columns.
    df1 : optional second DataFrame with the same columns; when omitted only
        ``df`` is drawn.
    """
    # Select the columns by the names passed in; `df.x_feature` would look for
    # a column literally called "x_feature".
    plt.plot(df[x_feature] , df[y_feature])
    if df1 is not None:
        plt.plot(df1[x_feature] , df1[y_feature])
    plt.title( x_feature + ' vs ' + y_feature)
    plt.xlabel(x_feature)
    plt.ylabel(y_feature)
    plt.show()

In [None]:
def bar_plot():
    """Bar chart of ``top_10_populated_countries.population`` divided by 10**6.

    Reads the module-level names ``x`` (bar positions) and
    ``top_10_populated_countries`` (needs ``population`` and ``country``
    attributes); the ``country`` values become vertical tick labels.
    """
    positions = x
    countries = top_10_populated_countries
    scaled_population = countries.population / 10 ** 6
    plt.bar(positions, scaled_population)
    plt.xlabel('country raking wise')
    plt.ylabel('population')
    plt.xticks(positions, countries.country, rotation="vertical")
    plt.show()

In [None]:
def scatter_plot(x_feature , y_feature ,df):
    """Draw a scatter plot of ``df[y_feature]`` against ``df[x_feature]``.

    Parameters
    ----------
    x_feature, y_feature : column names; also used as the axis labels.
    df : DataFrame containing both columns.
    """
    # Select the columns by the names passed in; `df.x_feature` would look for
    # a column literally called "x_feature". The marker size is 10.
    plt.scatter(df[x_feature] , df[y_feature] , s=10 )
    plt.xlabel(x_feature)
    plt.ylabel(y_feature)
    plt.show()

In [None]:
def pie_plot(label,data):
    """Draw a pie chart of ``data`` with ``label`` as the wedge labels.

    Each wedge is annotated with its percentage to one decimal place. The chart
    is drawn on axes added to a new figure with the rectangle ``[0, 0, 1, 1]``.
    """
    figure = plt.figure()
    axes = figure.add_axes([0, 0, 1, 1])
    axes.pie(data, labels=label, autopct='%1.1f%%')
    plt.show()

In [None]:
# Shuffle the rows with a fixed seed so the order is reproducible, in line with
# the random_state=0 used by the other stochastic steps in this notebook.
df = shuffle(df, random_state=0)