In [None]:
import pandas as pd
# Load the train and test CSV files into separate DataFrames.
# The paths are relative to the notebook's working directory.
df_train=pd.read_csv("../input/mercedesbenz-greener-manufacturing/train.csv")
df_test=pd.read_csv("../input/mercedesbenz-greener-manufacturing/test.csv")

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Identify the target variable: the column names that exist in the training
# frame but not in the test frame.
import numpy as np
np.setdiff1d(df_train.columns,df_test.columns)

In [None]:
# Join the training and testing data so that cleaning can be done on both together.
# keys=['x','y'] adds an outer index level: 'x' labels the training rows and
# 'y' labels the test rows. These index labels are unrelated to the target
# column, which is also named 'y'.
df=pd.concat([df_train,df_test],keys=['x','y'])
df.shape

In [None]:
#checking the missing value and the percentage
def miss_val(df):
    """Summarise the columns of ``df`` that contain missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one missing value, with
        ``missing_count`` (number of missing cells) and ``percentage``
        (share of missing cells on a 0-100 scale), sorted by
        ``percentage`` in descending order. The result has no rows when
        nothing is missing.
    """
    # Count once for every column, then keep only the columns with gaps.
    missing_count = df.isna().sum()
    missing_count = missing_count[missing_count > 0]
    # Express the share as a real percentage so the values match the column label.
    percentage = missing_count / len(df) * 100
    summary = pd.DataFrame({'missing_count': missing_count, 'percentage': percentage})
    return summary.sort_values('percentage', ascending=False)
# Missing-value summary for the combined train + test frame.
# NOTE(review): the target column is expected to appear here, since the test
# rows carry no target value after the concat; confirm nothing else shows up.
miss_val(df)

In [None]:
# Report the (rows, columns) shape of the frame restricted to each dtype family,
# with a separator line between consecutive reports.
dtype_names = ('object', 'int64', 'float', 'bool')
for position, dtype_name in enumerate(dtype_names):
    if position > 0:
        print('********')
    print(df.select_dtypes(dtype_name).shape)

In [None]:
# Encode the categorical (object-dtype) columns as integer labels.
from sklearn import preprocessing

en_label = preprocessing.LabelEncoder()

# The encoder is re-fitted for every column, so each column gets its own
# label-to-integer mapping and only the last mapping stays on `en_label`.
object_columns = df.select_dtypes('object').columns
for column_name in object_columns:
    df[column_name] = en_label.fit_transform(df[column_name])

In [None]:
# Remove near-constant features: columns whose variance is below 0.001.
# NOTE(review): the selector is fitted on the combined train + test frame,
# which still contains the ID and target columns; confirm this is intended.
from sklearn.feature_selection import VarianceThreshold
#threshold_n=0.05
sel = VarianceThreshold(threshold=.001)
# Transformed array; the notebook works with the DataFrame `df1` below instead.
sel_var = sel.fit_transform(df)
# Select the surviving columns with the boolean support mask and take an
# explicit copy, so later edits to `df1` never act on a slice of `df`.
df1 = df.loc[:, sel.get_support()].copy()

In [None]:
#Removing Features which are Highly Correlated
def correlation(dataset, threshold):
    """Return the names of columns that are strongly correlated with an earlier column.

    A column is reported when the absolute value of its pairwise correlation
    with at least one column positioned before it in ``dataset.corr()``
    is strictly greater than ``threshold``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose pairwise column correlations are evaluated.
    threshold : float
        Cut-off applied to the absolute correlation coefficient.

    Returns
    -------
    set
        Names of the flagged columns.
    """
    corr_matrix = dataset.corr()
    names = corr_matrix.columns
    # Only the lower triangle is scanned: each row is compared with the
    # columns that precede it, so the first member of a pair is never flagged.
    return {
        names[row]
        for row in range(len(names))
        if any(abs(corr_matrix.iloc[row, col]) > threshold for col in range(row))
    }

# Leave the target column out of the correlation screen.
features_no_output = df1.drop(columns='y')

# Flag every feature whose absolute correlation with an earlier feature exceeds 0.85.
corr_features = correlation(dataset=features_no_output, threshold=0.85)

print("\n")
print("Correlated Features :\n ", corr_features)
print("\n")
print("No. of Features Correlated: ", len(corr_features))

In [None]:
# Drop the highly correlated features.
# Reassigning (instead of dropping in place) avoids mutating a frame that may
# be a selection of `df`, and errors="ignore" keeps the cell re-runnable after
# the columns are already gone.
df1 = df1.drop(columns=list(corr_features), errors="ignore")

In [None]:
# Shape of the combined frame after the variance and correlation filters.
df1.shape

In [None]:
# Separate the original train and test rows again using the outer index
# labels set during the concat: "x" marks train rows and "y" marks test rows.
train_new = df1.loc["x"]
test_rows = df1.loc["y"]
# Here 'y' is the target column, which the test rows must not carry.
test_new = test_rows.drop(columns='y')
test_new.shape

In [None]:
# Build the feature matrix and the target vector from the training rows.
import numpy as np

x = train_new.drop(columns='y')
# The target is modelled on a natural-log scale, so predictions have to be
# passed through np.exp to get back to the original scale.
y = np.log(train_new['y'])

In [None]:
# Train/test split: hold out part of the labelled data for validation.
# random_state=10 fixes the shuffle so the split is the same on every run.
from sklearn.model_selection import train_test_split
# NOTE(review): the mutual_info_* helpers are not used in the cells visible here.
from sklearn.feature_selection import mutual_info_regression, mutual_info_classif
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=10)

In [None]:
# Standardise the features (zero mean, unit variance).
from sklearn.preprocessing import StandardScaler
stdSC = StandardScaler()
# Learn the mean and standard deviation from the training split only ...
x_train_std = stdSC.fit_transform(x_train)
# ... and reuse those statistics for the hold-out split and the final test
# set. Re-fitting on each set would scale every set differently and would
# leak information from the evaluation data.
x_test_std = stdSC.transform(x_test)
x_final_std = stdSC.transform(test_new)

In [None]:
# Shape of the standardised training matrix.
x_train_std.shape

In [None]:
# Feature engineering: project the standardised features onto principal components.
from sklearn.decomposition import PCA
import numpy as np

# PCA cannot extract more components than min(n_samples, n_features), so the
# requested 210 is capped by the shape of the training matrix.
n_components = min(210, *x_train_std.shape)
PCAModel = PCA(n_components)
# Fit on the training split only; the other sets are projected with the same components.
x_train_com = PCAModel.fit_transform(x_train_std)
x_test_com = PCAModel.transform(x_test_std)
test_new_com = PCAModel.transform(x_final_std)

# Cumulative share of variance (in %) explained by the leading components.
np.cumsum(PCAModel.explained_variance_ratio_*100)

### Model building

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,r2_score

# The forest is trained on the unscaled features. A fixed random_state makes
# the fitted trees, and therefore the scores printed below, reproducible.
rfr = RandomForestRegressor(
    max_depth=7,
    min_samples_leaf=3,
    n_estimators=100,
    min_samples_split=2,
    random_state=10,
)
rfr.fit(x_train, y_train)
y_predictrfr = rfr.predict(x_train)

# Model score on the training split and on the hold-out split, followed by
# the hold-out RMSE (measured on the log-transformed target).
print(rfr.score(x_train,y_train))
print(rfr.score(x_test,y_test))
print('RMSE:' + str(np.sqrt(mean_squared_error(y_test,rfr.predict(x_test)))))


In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the PCA-projected features.
model = LinearRegression()
model.fit(x_train_com, y_train)

lr_train_score = model.score(x_train_com, y_train)
lr_test_score = model.score(x_test_com, y_test)
print('LR train model:' + str(lr_train_score))
print('LR test model:' + str(lr_test_score))

In [None]:
from xgboost import XGBRegressor

# Gradient-boosted trees with default settings on the standardised features.
XGB = XGBRegressor()
XGB.fit(x_train_std, y_train)

xgb_train_score = XGB.score(x_train_std, y_train)
xgb_test_score = XGB.score(x_test_std, y_test)
print('XGB train model:' + str(xgb_train_score))
print('XGB test  model:' + str(xgb_test_score))

In [None]:
from sklearn.linear_model import Lasso

# L1-regularised linear regression on the PCA-projected features.
lassoModel = Lasso(alpha=.0001)
lassoModel.fit(x_train_com, y_train)
# Label the scores with the model that produced them, so this output cannot
# be confused with the other linear models in the notebook.
print("Train Score (Lasso):", lassoModel.score(x_train_com, y_train))
print("Test Score (Lasso):", lassoModel.score(x_test_com, y_test))

In [None]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error,r2_score

# L2-regularised linear regression on the PCA-projected features.
RModel = Ridge(alpha=.0001)
RModel.fit(x_train_com, y_train)
# Label the scores with the model that produced them, so this output cannot
# be confused with the other linear models in the notebook.
print("Train Score (Ridge):", RModel.score(x_train_com, y_train))
print("Test Score (Ridge):", RModel.score(x_test_com, y_test))
# Hold-out RMSE, measured on the log-transformed target.
print('RMSE:' + str(np.sqrt(mean_squared_error(y_test,RModel.predict(x_test_com)))))

In [None]:
# ElasticNet: linear regression with combined L1 and L2 regularisation,
# fitted on the PCA-projected features.
from sklearn.linear_model import ElasticNet

EModel = ElasticNet(alpha=.0001)
EModel.fit(x_train_com, y_train)
# Label the scores with the model that produced them, so this output cannot
# be confused with the other linear models in the notebook.
print("Train Score (ElasticNet):", EModel.score(x_train_com, y_train))
print("Test Score (ElasticNet):", EModel.score(x_test_com, y_test))

### We take the random forest regressor (`rfr`) as our final model, since its R² score is about 64%.

In [None]:
# Predict the final test set with the random forest.
# The model was trained on the log of the target, so these values are on the log scale.
rfr.predict(test_new)

In [None]:

# Pair every test ID with its prediction (still on the log scale).
log_predictions = rfr.predict(test_new)
A = pd.DataFrame({'ID': test_new['ID'], 'y_test': log_predictions})

In [None]:
# Undo the log transform of the target to get predictions on the original scale.
A = A.assign(final_y=np.exp(A['y_test']))
A.loc[:, ['ID', 'final_y']]