In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found beneath the Kaggle input
# directory, then print one path per line.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
# Relative locations of the competition's training and test CSV files.
filepath_train = "../input/30-days-of-ml/train.csv"
filepath_test = "../input/30-days-of-ml/test.csv"

# Read both files the same way: the first CSV column becomes the row index.
df_train, df_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (filepath_train, filepath_test)
)



In [1]:
# Preview the first rows of the test frame.
df_test.head()

In [1]:
# Summarise the training frame (columns, non-null counts, dtypes).
df_train.info()

In [1]:
# Report the (rows, columns) shape of each frame, train first and then test.
for shape_label, frame in (
    ("Shape of train data: ", df_train),
    ("Shape of test data: ", df_test),
):
    print(shape_label, frame.shape)

In [1]:
# Separate the numerical (float64) and categorical (object) column labels
# of the train and test frames.
cols_train_numeric = df_train.select_dtypes(include='float64').columns
cols_train_object = df_train.select_dtypes(include='object').columns

cols_test_numeric = df_test.select_dtypes(include='float64').columns
cols_test_object = df_test.select_dtypes(include='object').columns



In [1]:
import seaborn as sns
import matplotlib.pyplot as plt

In [1]:


# Default size (in inches) for the figures drawn from here on.
plt.rcParams.update({'figure.figsize': (12, 10)})

# Visualise missing values in the train data: each cell of the heatmap is
# coloured by whether the corresponding entry is null.
train_null_mask = df_train.isna()
sns.heatmap(train_null_mask, yticklabels=False, cmap='viridis')

# Author's reading of the plot: the train data has no missing values.

In [1]:
# Same missing-value heatmap, this time for the test data.
test_null_mask = df_test.isna()
sns.heatmap(test_null_mask, yticklabels=False, cmap='viridis')

Check whether the test data contains any columns that are not present in the train data. The printed list is empty, which means there are no such columns.

In [1]:
# Keep the test column labels that do not occur among the train column labels,
# preserving their order in the test frame.
is_extra = ~df_test.columns.isin(df_train.columns)
extra_cols = df_test.columns[is_extra].tolist()
print(extra_cols)

Now check the skewness and kurtosis of the train and test dataframes.

In [1]:
# Skewness of each numeric column of the train data. numeric_only=True is
# passed explicitly because pandas 2.0+ no longer skips non-numeric columns by
# default and raises a TypeError on the categorical (object) columns.
df_train.skew(numeric_only=True)

In [1]:
# Skewness of each numeric column of the test data. numeric_only=True is
# passed explicitly because pandas 2.0+ no longer skips non-numeric columns by
# default and raises a TypeError on the categorical (object) columns.
df_test.skew(numeric_only=True)

In [1]:
# Kurtosis of each numeric column of the train data. numeric_only=True is
# passed explicitly because pandas 2.0+ no longer skips non-numeric columns by
# default and raises a TypeError on the categorical (object) columns.
df_train.kurt(numeric_only=True)

In [1]:
# Kurtosis of each numeric column of the test data. numeric_only=True is
# passed explicitly because pandas 2.0+ no longer skips non-numeric columns by
# default and raises a TypeError on the categorical (object) columns.
df_test.kurt(numeric_only=True)

**Correlation**

In [1]:
# Enlarge the default figure so the annotated correlation matrix stays readable.
plt.rcParams['figure.figsize'] = (14, 12)

# Correlate the numeric columns only: pandas 2.0+ raises on string columns
# instead of silently dropping them, so they are filtered out up front.
numeric_train = df_train.select_dtypes(include='number')
sns.heatmap(numeric_train.corr(), annot=True, cmap="Blues")

**Checking cardinality of data**

In [1]:
# For each frame, list the categorical (object) columns that hold more than
# 15 distinct values and print them with the matching message.
for cardinality_message, frame in (
    ("columns with high cardinality for train data are: ", df_train),
    ("columns with high cardinality for test data are: ", df_test),
):
    distinct_counts = frame.select_dtypes('object').nunique()
    different_categories = distinct_counts[distinct_counts > 15].index.tolist()
    print(cardinality_message, different_categories)

Check whether any categorical column of the test data contains a category/value that is not present in the same column of the train data.

In [1]:
# One flag per categorical test column: True when every value seen in the
# test column also occurs in the train column of the same name.
result = [
    set(df_test[column].unique()).issubset(df_train[column].unique())
    for column in cols_test_object
]

print(result)

***Separating the features and the target of the train data***

In [1]:
# Features: every column of the train frame except the label.
X_train = df_train.drop(columns=['target'])
# Label: the 'target' column on its own.
y_train = df_train.loc[:, 'target']

In [1]:
from sklearn.preprocessing import OneHotEncoder

In [1]:
# Build a one-hot encoder that returns a dense array and tolerates categories
# that were not seen during fit.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so the new spelling is tried first and the old one is the fallback
# for older installations.
try:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fit on the categorical train columns and keep the row labels of X_train.
X_train_cat_encoded = pd.DataFrame(
    encoder.fit_transform(X_train[cols_train_object.to_list()]),
    index=X_train.index,
)
X_train_cat_encoded.shape

In [1]:
# Strip the raw categorical columns from the feature frame so that only the
# remaining (non-object) columns are left, then display the result.
X_train = X_train.drop(columns=cols_train_object.to_list())
X_train

In [1]:
from sklearn.preprocessing import StandardScaler

# Standardise the remaining train columns using statistics learned from the
# train data itself.
scaler = StandardScaler()
scaler.fit(X_train)

# Wrap the scaled values in a DataFrame that carries the original column
# labels; the new frame gets a fresh default row index.
X_train_encoded_scaled = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_train_encoded_scaled

In [1]:
# One-hot encode the categorical test columns with the encoder fitted on the
# train data. The columns are selected with the train list because those are
# the columns (and the order) the encoder was fitted on.
# A DataFrame built from the encoded array already has a default 0..n-1 index,
# so assigning the test index and immediately resetting it is unnecessary.
X_test_cat_encoded = pd.DataFrame(encoder.transform(df_test[cols_train_object.to_list()]))
X_test_cat_encoded

In [1]:
# Drop the raw categorical columns from the test frame and display what is left.
df_test = df_test.drop(columns=cols_test_object.to_list())
df_test



In [1]:

# Scale the test data with the scaler that was already fitted on the train
# data. Creating and fitting a second scaler on the test set would standardise
# it with different means/variances than the ones the models are trained on,
# so the test features would no longer be on the same scale as the train ones.
X_test_encoded_scaled = pd.DataFrame(scaler.transform(df_test), columns=df_test.columns)
X_test_encoded_scaled.shape

In [1]:
# Give both test pieces a default 0..n-1 row index so they line up row by row,
# then place the one-hot columns and the scaled columns side by side.
X_test_encoded_scaled = X_test_encoded_scaled.reset_index(drop=True)
X_test_cat_encoded = X_test_cat_encoded.reset_index(drop=True)
X_test = pd.concat(
    [X_test_cat_encoded, X_test_encoded_scaled],
    axis=1,
    join='inner',
)
X_test

Concatenating the one-hot encoded categorical columns with the scaled numerical columns

In [1]:

# Give both train pieces a default 0..n-1 row index so they line up row by row,
# then place the one-hot columns and the scaled columns side by side.
X_train_encoded_scaled = X_train_encoded_scaled.reset_index(drop=True)
X_train_cat_encoded = X_train_cat_encoded.reset_index(drop=True)
X_train = pd.concat(
    [X_train_cat_encoded, X_train_encoded_scaled],
    axis=1,
    join='inner',
)
X_train

Splitting the train data into training and validation sets

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [1]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
# Note: X_train and y_train are rebound to the remaining training portion.
split_parts = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = split_parts

In [1]:
# Fit a linear regression baseline on the training split.
lin_reg_model = LinearRegression().fit(X_train, y_train)

# Score it on the validation split and report the root mean squared error.
predictions = lin_reg_model.predict(X_valid)
mse = mean_squared_error(y_true=y_valid, y_pred=predictions)
print("Error for linear Regression: ", np.sqrt(mse))


In [1]:
# Decision tree baseline. Tree construction involves randomness, so the seed
# is fixed to make the reported error reproducible from run to run.
decision_tree_model = DecisionTreeRegressor(random_state=42)
decision_tree_model.fit(X_train, y_train)

# Predict on the validation split and report the root mean squared error.
predictions = decision_tree_model.predict(X_valid)

mse = mean_squared_error(y_valid, predictions)
print("Error for Decision Tree: ", np.sqrt(mse))


In [1]:
# Random forest model. Bootstrapping and feature sampling are random, so the
# seed is fixed to make both the validation error and the predictions produced
# by this model reproducible from run to run.
model_forest = RandomForestRegressor(random_state=42)
model_forest.fit(X_train, y_train)

# Predict on the validation split and report the root mean squared error.
predictions = model_forest.predict(X_valid)

mse = mean_squared_error(y_valid, predictions)
print("Error for random forest : ", np.sqrt(mse))

In [1]:
# Predict the target for the preprocessed test features with the fitted random forest.
test_predictions = model_forest.predict(X_test)


In [1]:

# Display the preprocessed test feature frame that was passed to the model.
X_test

In [1]:
# Pair each test row label with its predicted target and save the result to a
# CSV file, without writing the DataFrame's own row index.
submission_columns = {
    'id': df_test.index,
    'target': test_predictions,
}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)

