# Mercedes-Benz Greener Manufacturing

## Imports

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline



from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor


import os
# Print the full path of every file found under the Kaggle input directory.
for root, _dirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

## Loading data

In [None]:
# Read the zipped competition CSVs: training frame first, then the test frame.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/mercedes-benz-greener-manufacturing/train.csv.zip',
        '../input/mercedes-benz-greener-manufacturing/test.csv.zip',
    )
)

## Data analysis

In [None]:
# Preview only the first rows instead of rendering the whole training frame.
train.head()

Let's start with the target variable.

In [None]:
# Two views of the target side by side: a distribution plot and a box plot.
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(15, 5))

sns.distplot(train.y.values, bins=50, color='b', ax=ax_hist)
ax_hist.set_title('Disribution of target variable', fontsize=15)
ax_hist.set_xlabel('Seconds')

# Pass the values as `x=` explicitly. Newer seaborn releases interpret a bare
# positional argument as `data`, which may draw the box along the other axis
# and leave the 'Seconds' label on the wrong one.
sns.boxplot(x=train.y.values, color='b', ax=ax_box)
ax_box.set_title('Disribution of target variable', fontsize=15)
ax_box.set_xlabel('Seconds');

In [None]:
# Plot the target values, sorted in ascending order, against their rank.
fig, ax = plt.subplots(figsize=(15, 5))
sorted_y = np.sort(train.y.values)
ax.scatter(range(len(sorted_y)), sorted_y, color='b')
ax.set_title("Disribution of target variable")
ax.set_ylabel("Seconds")
ax.set_xlabel('Number of cars');

In [None]:
# Summary statistics (count, mean, spread, quartiles) of the target column.
train['y'].describe()

The target variable mostly lies between about 72 and 140 seconds. There are outliers starting from about 150 seconds, and we can remove them from the training set (the cell below keeps only rows with `y < 140`).

In [None]:
# Keep only the rows whose target value is strictly below 140.
train = train.loc[train['y'] < 140]

How the target variable depends on ID

In [None]:
# Scatter of the target against the car ID with a fitted regression line in red.
fig, ax = plt.subplots(figsize=(15, 5))
sns.regplot(data=train, x='ID', y='y', line_kws={'color': 'red'}, ax=ax)
ax.set_ylabel("Seconds")
ax.set_xlabel('Cars ID');

In [None]:
# Count how many columns there are of each dtype.
train.dtypes.value_counts()

In [None]:
# Split the column names by storage dtype: string-like (object) vs. int64.
column_dtypes = train.dtypes
object_features = column_dtypes[column_dtypes == 'object'].index
int_features = column_dtypes[column_dtypes == 'int64'].index

In [None]:
# Summary statistics of the training frame's columns.
train.describe()

### Missing values

In [None]:
# Per-column count of missing values in train, showing only columns that have any.
train.isnull().sum().loc[lambda null_counts: null_counts > 0]

In [None]:
# Per-column count of missing values in test, showing only columns that have any.
test.isnull().sum().loc[lambda null_counts: null_counts > 0]

There are no missing values in the train or test sets.

In [None]:
# Take the maximum of every int64 column, then count how often each maximum occurs.
train.loc[:, int_features].max().value_counts()

We can see that 368 features are binary, and 12 of them contain only zeros. The maximum ID is 8417.

In [None]:
# Compute each int64 column's variance once, then collect (and print) the
# columns whose variance is exactly zero, i.e. constant columns.
column_variances = {col: train[col].var() for col in int_features}
non_var_features = [col for col, variance in column_variances.items() if variance == 0]
for col in non_var_features:
    print(col, column_variances[col])

We can remove these variables from the analysis, because they are constant and therefore cannot affect the time that the car took to pass.

In [None]:
# Remove the zero-variance columns collected in `non_var_features`.
train = train.drop(columns=non_var_features)

In [None]:
# For every categorical column, print its distinct values in train, then in test.
for col in object_features:
    for frame in (train, test):
        print(col, frame[col].unique())

In [None]:
# Replace each categorical column with integer codes. The encoder is fitted on
# the train and test values together so both frames share one mapping.
for col in object_features:
    encoder = LabelEncoder()
    encoder.fit(train[col].tolist() + test[col].tolist())
    train[col] = encoder.transform(train[col].tolist())
    test[col] = encoder.transform(test[col].tolist())

In [None]:
# Correlate the target with the label-encoded categorical columns. The columns
# are selected by name rather than by position, so the result does not depend
# on where they happen to sit in the frame.
corr_matrix = train[['y', *object_features]].corr()
corr_matrix

In [None]:
# Annotated heatmap of `corr_matrix`.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(
    corr_matrix,
    annot=True,
    vmax=1,
    cmap='coolwarm',
    linewidths=3,
    linecolor='black',
    ax=ax,
)
ax.set_title('Correlation Matrix', fontsize=14);

There is not much dependence between the categorical variables and the target.

In [None]:
# Pairwise correlations between all columns except the categorical ones.
num_corr = train.drop(columns=object_features).corr()

We can see that some of the numeric features are directly correlated with others; therefore, to avoid multicollinearity, we can remove them.

In [None]:
# Transpose so each column becomes a row, then count how many of those rows are
# exact repeats of an earlier one (True = duplicated column).
train.T.duplicated().value_counts()

In [None]:
# Keep the first occurrence of every distinct column. Selecting with a boolean
# column mask avoids transposing the frame there and back, which would coerce
# all columns to one common dtype.
is_duplicate_column = train.T.duplicated()
train = train.loc[:, ~is_duplicate_column]

In [None]:
# (rows, columns) of the training frame at this point in the notebook.
train.shape

## Modeling

In [None]:
# Features are every remaining column except the target; the test frame is
# restricted to exactly the same columns, in the same order.
y_train = train['y']
X_train = train.drop(columns='y')
X_test = test.loc[:, X_train.columns]

In [None]:
# Hold out 20% of the rows for validation (fixed seed, so the split is repeatable).
# NOTE: X_train / y_train are rebound to the remaining 80%, so re-running this cell
# without first re-running the cell above would split the already-reduced data again.
X_train, X_val, y_train, y_val=train_test_split(X_train, y_train, test_size=0.2, random_state=42)

### Linear Regression

In [None]:
# Linear regression baseline; the cell's last expression is its validation score.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
print('Validation ')
lin_reg.score(X_val, y_val)

In [None]:
# Validation R^2 of the linear model, computed from its predictions.
r2_score(y_val, lin_reg.predict(X_val))

### Random Forest

In [None]:
# Random forest with a fixed seed so that the fitted model, and therefore the
# validation score and the submission, are repeatable across runs. 42 matches
# the seed used for the train/validation split.
rf_reg = RandomForestRegressor(
    n_estimators=200,
    max_depth=20,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
rf_reg.fit(X_train, y_train)
rf_reg.score(X_val, y_val)

In [None]:
# Predict the target for the test rows with the fitted random forest.
sub=rf_reg.predict(X_test)

In [None]:
# Pair every test ID with its prediction and write the file without the index.
submission = X_test[['ID']].assign(y=sub)
submission.to_csv('Submission.csv', index=False)