In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line in the order os.walk yields them.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import joblib
import numpy as np
import pandas as pd
import warnings

import matplotlib
import matplotlib.pyplot as plt
from matplotlib import ticker
import seaborn as sns

# Suppress every warning for the rest of the session (no category or module
# filter is given, so this applies to all of them).
# NOTE(review): this also hides deprecation and convergence warnings raised by
# the libraries used in later cells.
warnings.filterwarnings('ignore')

# Load the Data

In [None]:
# import datasets
# Read the three competition CSVs in a fixed order:
# training set, test set, sample submission.
train_df, test_df, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-sep-2021/train.csv',
        '/kaggle/input/tabular-playground-series-sep-2021/test.csv',
        '/kaggle/input/tabular-playground-series-sep-2021/sample_solution.csv',
    )
)

In [None]:
train_df.head(20)

In [None]:
train_df.claim.nunique()

In [None]:
train_df.shape

In [None]:
train_df.isnull().sum()

# Let's look at the data

In [None]:
train_df.describe()

In [None]:
test_df.describe()

In [None]:
# Per-feature missing-value summary for the training set: one row per feature
# with its NaN count and that count as a fraction (0-1) of all training rows.
# 'id' and 'claim' are excluded because they are not features.
missing_train_df = (
    train_df.isna()
    .sum(axis=0)
    .drop(['id', 'claim'])
    .rename_axis('feature')
    .reset_index(name='count')
)
missing_train_df['count_percent'] = missing_train_df['count'] / len(train_df)


# Same summary for the test set, which has no 'claim' column to exclude.
missing_test_df = (
    test_df.isna()
    .sum()
    .drop(['id'])
    .rename_axis('feature')
    .reset_index(name='count')
)
missing_test_df['count_percent'] = missing_test_df['count'] / len(test_df)

In [None]:
missing_test_df

In [None]:
# For every training row, count how many feature values are missing, then
# tabulate which share of rows has 0, 1, 2, ... missing features.
missing_train_row = train_df.drop(columns=['id', 'claim']).isna().sum(axis=1)
missing_train_feature_numbers = (
    missing_train_row.value_counts()
    .div(len(train_df))
    .rename_axis('no_of_feature')
    .reset_index(name='count_percent')
)

# Same distribution for the test rows.
missing_test_row = test_df.drop(columns=['id']).isna().sum(axis=1)
missing_test_feature_numbers = (
    missing_test_row.value_counts()
    .div(len(test_df))
    .rename_axis('no_of_feature')
    .reset_index(name='count_percent')
)

In [None]:
missing_test_feature_numbers

# There are a lot of missing values. Let's look at their correlation with the target

Let's see why people are so interested in the per-row null counts.

In [None]:
# Add a 'num_nulls' column to each frame holding the number of missing feature
# values in that row; identifier/label columns are left out of the count.
for frame, non_feature_cols in ((train_df, ['id', 'claim']), (test_df, ['id'])):
    frame['num_nulls'] = frame.drop(columns=non_feature_cols).isna().sum(axis='columns')

In [None]:
train_df['num_nulls'].corr(train_df['claim'])

Damn! That's a large correlation. We need to keep this feature.

# Let's also check whether the classes are imbalanced

In [None]:
train_df.claim.value_counts()

In [None]:
train_df

Good to go. Can optimize later.

# So now the remaining work is handling the null values (plus a bit of preprocessing)
But we can't simply drop the affected rows, because a large number of rows contain a single null value.

In [None]:
%%time
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import QuantileTransformer,  KBinsDiscretizer
from sklearn.impute import SimpleImputer

# QuantileTransformer estimates its quantiles from a random subsample of the
# rows when the input is larger than its `subsample` size. Without a fixed seed
# the transformed features (and everything downstream) change between runs.
SEED = 42

# Every column except the label and the identifier is treated as a feature.
features = [col for col in train_df.columns if col not in ['claim', 'id']]

# Preprocessing, fitted on the training frame only and then applied to test:
#   1. replace NaNs with the per-column median,
#   2. map each column onto a uniform distribution using 64 quantiles,
#   3. discretise each column into 64 equal-width ordinal bins.
pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='median', missing_values=np.nan)),
        ("scaler", QuantileTransformer(n_quantiles=64, output_distribution='uniform',
                                       random_state=SEED)),
        ('bin', KBinsDiscretizer(n_bins=64, encode='ordinal', strategy='uniform'))
        ])
train_df[features] = pipe.fit_transform(train_df[features])
test_df[features] = pipe.transform(test_df[features])

In [None]:
test_df.isnull().sum()

In [None]:
train_df

In [None]:
# Split the label from the model inputs (one cell, so train and test always
# end up with the same feature columns).
target = train_df['claim'].copy()

# Remove only the non-predictive columns. 'num_nulls' (the per-row count of
# missing values) is intentionally kept in BOTH frames: it correlates strongly
# with 'claim', and the scaler/PCA fitted on the training columns require the
# test frame to carry exactly the same columns.
train_df = train_df.drop(columns=['claim', 'id'])
test_df = test_df.drop(columns=['id'])

In [None]:
target

In [None]:
train_df

In [None]:
test_df

In [None]:
# outlier handling

In [None]:
# transformations

In [None]:
# standardization

In [None]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import KBinsDiscretizer

# create the scaler object
# Standardise every column using statistics learned from the training frame
# only; the test frame is transformed with those same statistics.
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_df)
test_scaled = scaler.transform(test_df)

# Wrap the scaled arrays back into DataFrames that carry the original column names.
standardized_df_train, standardized_df_test = (
    pd.DataFrame(scaled_values, columns=source_frame.columns)
    for scaled_values, source_frame in ((train_scaled, train_df), (test_scaled, test_df))
)

standardized_df_train

In [None]:
# PCA
from sklearn.decomposition import PCA
# Make an instance of the Model
# Fit a PCA with no component limit on the standardized training data, then
# project both the training and the test data onto the fitted components.
pca = PCA().fit(standardized_df_train)

X_train_pca, X_test_pca = (
    pca.transform(standardized_frame)
    for standardized_frame in (standardized_df_train, standardized_df_test)
)

In [None]:
# Per-component share of the total variance, read from the fitted PCA.
explained_variance_ratio=pca.explained_variance_ratio_
# Bare expression so the notebook displays the array.
explained_variance_ratio

In [None]:
# Share of the total variance captured by the leading principal components.
# The result is stored under its own name so the built-in `sum` is not
# overwritten for later cells, and slicing (instead of indexing 0..109) also
# works when fewer than N_COMPONENTS_KEPT components are available.
N_COMPONENTS_KEPT = 110
arr = explained_variance_ratio
explained_variance_total = arr[:N_COMPONENTS_KEPT].sum()
print("Sum :" + str(explained_variance_total))

In [None]:
# not many dimensions can be reduced

In [None]:
# Refit the PCA keeping only the first 110 components and replace the earlier
# projections of the training and test data with the reduced ones.
pca = PCA(n_components=110).fit(standardized_df_train)

X_train_pca, X_test_pca = (
    pca.transform(standardized_frame)
    for standardized_frame in (standardized_df_train, standardized_df_test)
)

In [None]:
#Implementing Linear Regression
from sklearn import linear_model
# Ordinary least-squares regression of the target on the PCA-projected training data.
lm = linear_model.LinearRegression()
lm.fit(X_train_pca, target)
# `fit` returns the estimator itself, so `model` is the same object as `lm`.
model = lm

In [None]:
# Fitting Polynomial Regression to the dataset
# from sklearn.preprocessing import PolynomialFeatures
# poly_reg = PolynomialFeatures(degree=4)
# X_poly = poly_reg.fit_transform(standardized_df_train)
# pol_reg = LinearRegression()
# model2=pol_reg.fit(X_poly, target)

In [None]:
# Score the PCA-projected test data and keep the results in a one-column frame.
predictions = lm.predict(X_test_pca)
y_hat = pd.DataFrame({"predicted": predictions})
print(y_hat.iloc[:10]) #print predictions for first ten values

In [None]:
# Round each prediction to one decimal place.
# NOTE(review): this leaves only a handful of distinct score values; if the
# competition metric is rank-based (e.g. ROC AUC — TODO confirm), the ties
# created here can lower the score.
y_hat['predicted'] = y_hat['predicted'].round(decimals = 1)

In [None]:
y_hat

In [None]:
# Copy the predictions into the sample-submission frame and write it to disk.
# The 1-D 'predicted' values are assigned positionally rather than assigning
# the whole DataFrame: that form only works for single-column frames and
# aligns on the index, so a row mismatch would silently produce NaNs.
if len(y_hat) != len(submission):
    raise ValueError(
        f"prediction count ({len(y_hat)}) does not match "
        f"submission rows ({len(submission)})"
    )
submission['claim'] = y_hat['predicted'].to_numpy()
submission.to_csv('submission.csv', index=False)