 # Importing Libraries and Dataset

In [None]:
import warnings
# Silence every warning for the rest of the session. Note that this also hides
# any deprecation or copy warnings that later cells might otherwise surface.
warnings.filterwarnings('ignore')

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training set. read_csv already returns a DataFrame, so no extra
# pd.DataFrame(...) wrapper is needed.
df = pd.read_csv('/kaggle/input/tabular-playground-series-feb-2021/train.csv')

In [None]:
# Preview the first rows of the training data to sanity-check the load.
df.head()

#  Data Analysis


In [None]:
# Summary statistics of the training data.
df.describe()

In [None]:
# (number of rows, number of columns) of the training data.
df.shape

In [None]:
# List the dtype of every column to see which ones are stored as object
df.dtypes

In [None]:
# For each column, report True if it contains at least one null value
df.isnull().any()

Now we'll find the correlation between the attributes and the target variable using a heatmap.

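Note that a correlation heatmap over the whole dataset covers only the columns of dtype int or float; object columns are excluded. (pandas versions before 2.0 drop object columns from `corr()` automatically; from 2.0 onwards they must be filtered out explicitly, e.g. with `numeric_only=True`.)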

In [None]:
# Correlate only the numeric columns: pandas >= 2.0 raises on object columns
# in corr() instead of silently skipping them, so select them explicitly.
numeric_corr = df.select_dtypes(include='number').corr()

# Draw on the axes created here so the figure size is the one requested.
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(numeric_corr, annot=True, ax=ax)

In [None]:
# Simple function to get the name of most correlated attributes
def get_correlated_col(cor_data, threshold):
    """Return the correlations whose magnitude is strictly above a threshold.

    Args:
        cor_data: pandas Series of correlation coefficients, indexed by
            feature name (for example one column of ``DataFrame.corr()``).
        threshold: Cut-off; an entry is kept only when ``abs(value)`` is
            strictly greater than this number.

    Returns:
        DataFrame with a single column ``'corr value'`` holding the kept
        coefficients, indexed by their feature names in the original order.
        NaN coefficients never pass the comparison and are therefore dropped.
    """
    # A boolean mask replaces the per-label Python loop. It also avoids scalar
    # label lookups, which return a Series (and break the comparison) when the
    # index contains duplicate labels.
    strong = cor_data[cor_data.abs() > threshold]
    return pd.DataFrame(
        data=strong.tolist(),
        index=strong.index.tolist(),
        columns=['corr value'],
    )

In [None]:
# Correlation of every numeric column with the target. Object columns are
# excluded up front because pandas >= 2.0 no longer skips them in corr().
target_corr = df.select_dtypes(include='number').corr()['target']

# Keep the columns whose absolute correlation with the target exceeds 0.02.
top_correlated_value = get_correlated_col(target_corr, 0.02)
top_correlated_value

In [None]:
# Keep only the columns selected above. Take an independent copy so that the
# categorical columns added to final_df in a later cell are written to this
# frame itself rather than to a slice of df.
final_df = df[top_correlated_value.index].copy()
final_df

Now let's work with the columns of dtype object.

In [None]:
# Separate frame holding only the object-dtype columns of the training data.
cat_df = df.select_dtypes(include='object')
cat_df.head()

In [None]:
#Changing categorical values(Object) into numerical value using Encoder
from sklearn.preprocessing import LabelEncoder 

# Replace every column of cat_df with integer codes; each column gets its own
# freshly fitted encoder, applied to the string form of the values.
cat_col = cat_df.columns
for column in cat_col:
    as_text = cat_df[column].astype('str')
    cat_df[column] = LabelEncoder().fit_transform(as_text)

In [None]:
# Check that the object columns now hold integer codes.
cat_df.head()

In [None]:
# Add the target column to cat_df so that the correlation of each encoded
# column with the target can be computed
cat_df['target'] = df['target'] 

In [None]:
# Confirm that the target column was appended.
cat_df.head()

In [None]:
# Heatmap of the pairwise correlations in cat_df (encoded columns plus target).
cat_corr = cat_df.corr()
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(data=cat_corr, annot=True)

In [None]:
# Encoded columns whose absolute correlation with the target exceeds 0.055.
cat_target_corr = cat_df.corr()['target']
top_correlated_value = get_correlated_col(cat_target_corr, 0.055)
top_correlated_value

In [None]:
# Copy the chosen encoded categorical columns into final_df.
for column in ('cat1', 'cat2', 'cat3'):
    final_df[column] = cat_df[column]


In [None]:
# Preview the assembled training frame (selected numeric columns plus cat1-cat3).
final_df.head()

In [None]:
# Split the frame into the target vector and the feature matrix.
y = final_df['target']
X = final_df.drop(columns='target')

In [None]:
# Preview the feature matrix before scaling.
X.head()

In [None]:
#Now we'll transform features by scaling each feature to a given range (0 to 1)

from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training features and rebuild X from the scaled
# values, keeping the original column names.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(X)
X = pd.DataFrame(scaled_values, columns=X.columns)
X.head()

# Fitting the Model

In [None]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the scaled features; fit() returns
# the estimator itself, which is echoed as the cell output.
LinearReg = LinearRegression().fit(X, y)
LinearReg

# Prediction 

In [None]:
# Load the test set. read_csv already returns a DataFrame, so no extra
# pd.DataFrame(...) wrapper is needed.
df_test = pd.read_csv('/kaggle/input/tabular-playground-series-feb-2021/test.csv')

In [None]:
# Preview the first rows of the test data.
df_test.head()

In [None]:
# Keep the test ids; they are written to the submission file later.
Id = df_test['id']
# Feature columns used for training: every column of final_df except the target.
test_df_columns = final_df.columns.drop('target')

In [None]:
# Show the feature columns that will be taken from the test set.
test_df_columns

In [None]:
# Restrict the test set to the training feature columns. Take an independent
# copy because a later cell overwrites the categorical columns of test_df,
# and those writes should not target a slice of df_test.
test_df = df_test[test_df_columns].copy()

In [None]:
# Inspect the selected test features before encoding and scaling.
test_df

In [None]:
#Changing categorical values(Object) into numerical value using Encoder
from sklearn.preprocessing import LabelEncoder 

# Encode the test categoricals with the label-to-code mapping of the TRAINING
# data. Fitting a fresh encoder on the test set derives the codes from the
# test set's own labels, which need not match the codes the model was trained
# on. df still holds the raw training strings and LabelEncoder orders its
# classes by sorting, so fitting on df[i] reproduces the training codes.
# NOTE: transform() raises ValueError if the test column contains a label
# that never occurs in the training column.
cat_col = ['cat1','cat2','cat3']
for i in cat_col:
    enc = LabelEncoder()
    enc.fit(df[i].astype('str'))
    test_df[i] = enc.transform(test_df[i].astype('str'))

In [None]:
# Check that cat1-cat3 in the test features now hold integer codes.
test_df.head()

In [None]:
#Now we'll transform features by scaling each feature to a given range (0 to 1)

from sklearn.preprocessing import MinMaxScaler

# Reuse the scaler that was fitted on the training features. Fitting a new
# MinMaxScaler on the test set would map the test data's own min/max to 0/1,
# putting it on a different scale from the one LinearReg was trained on.
test_df = pd.DataFrame(scaler.transform(test_df), columns=test_df.columns)
test_df.head()

In [None]:
# Predict the target for every row of the prepared test features.
y_pred = LinearReg.predict(test_df)

In [None]:
# Inspect the raw prediction array.
y_pred

In [None]:
# Assemble the submission table: one row per test id with its predicted target.
# Note that this rebinds final_df, which previously held the training frame.
submission_columns = {'id': Id, 'target': y_pred.flatten()}
final_df = pd.DataFrame(submission_columns)

In [None]:
# Preview the submission table before writing it out.
final_df.head()

In [None]:
# Write the submission to the current directory, omitting the DataFrame index.
final_df.to_csv('./submission.csv', index=False)