In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing


train_set = pd.read_csv("/kaggle/input/mercedesbenz-greener-manufacturing/train.csv")
print("Shape of training set: ", train_set.shape)
test_set = pd.read_csv("/kaggle/input/mercedesbenz-greener-manufacturing/test.csv")
print("Shape of testing set: ", test_set.shape)

print(train_set.head())



Shape of training set:  (4209, 378)
Shape of testing set:  (4209, 377)
   ID       y  X0 X1  X2 X3 X4 X5 X6 X8  ...  X375  X376  X377  X378  X379  \
0   0  130.81   k  v  at  a  d  u  j  o  ...     0     0     1     0     0   
1   6   88.53   k  t  av  e  d  y  l  o  ...     1     0     0     0     0   
2   7   76.26  az  w   n  c  d  x  j  x  ...     0     0     0     0     0   
3   9   80.62  az  t   n  f  d  x  l  e  ...     0     0     0     0     0   
4  13   78.02  az  v   n  f  d  h  d  n  ...     0     0     0     0     0   

   X380  X382  X383  X384  X385  
0     0     0     0     0     0  
1     0     0     0     0     0  
2     0     1     0     0     0  
3     0     0     0     0     0  
4     0     0     0     0     0  

[5 rows x 378 columns]


In [2]:
#check if there are any null rows
print("Is null data present in training set: ", train_set.isnull().any().any())
print("Is null data present in testing set: ", test_set.isnull().any().any())

Is null data present in training set:  False
Is null data present in testing set:  False


In [7]:
print(train_set.dtypes)
train_set = train_set.drop("ID", axis=1)

ID        int64
y       float64
X0       object
X1       object
X2       object
         ...   
X380      int64
X382      int64
X383      int64
X384      int64
X385      int64
Length: 378, dtype: object


In [9]:
#check for string data types and encoding them to integer
for columns in train_set.columns:
    if (train_set[columns].dtype == "object"):
        train_set[columns] = LabelEncoder().fit_transform(train_set[columns])

print(train_set.dtypes)

y       float64
X0        int64
X1        int64
X2        int64
X3        int64
         ...   
X380      int64
X382      int64
X383      int64
X384      int64
X385      int64
Length: 377, dtype: object


In [11]:
#Dividing into dependent and independent variables
X = train_set.iloc[:, train_set.columns != 'y']
Y = train_set.iloc[:, train_set.columns == 'y']
print(X.shape)
print(Y.shape)

(4209, 376)
(4209, 1)


In [12]:
#normalizing the data
min_max_scaler = preprocessing.MinMaxScaler()
X_normalized = min_max_scaler.fit_transform(X)
X = pd.DataFrame(X_normalized)

In [13]:
#since there are 377 independent variables, applying PCA to remove any highly correlated variables
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95)
principalComponents = pca.fit_transform(X)
X = pd.DataFrame(principalComponents)
print(X.head)

<bound method NDFrame.head of             0         1         2         3         4         5         6   \
0     0.682272  2.217390  1.233625  0.885738  1.401423  0.054226  0.654869   
1    -0.279051  1.164201 -0.764263 -0.660639  0.237862  0.066811  1.237312   
2    -1.018083  2.979512  0.558557  2.540751 -0.926713  3.282629 -0.940275   
3    -0.658559  2.545045 -0.425408  2.997377 -1.681631  3.134973  0.074150   
4    -0.652313  2.370739 -0.583703  3.194208 -1.999394  3.167654 -0.143355   
...        ...       ...       ...       ...       ...       ...       ...   
4204 -2.247975  0.219255  1.467999 -1.141169  0.968223  0.725073  0.153882   
4205  0.905271  0.345494  0.112032  1.821659  1.590330 -0.466509  2.791321   
4206 -1.146539  1.029285  1.524820 -0.049878 -0.927306  0.253831  2.094539   
4207  0.351668 -0.404269 -2.978771  1.422603  0.087980 -0.792489 -0.154592   
4208  0.805073  0.314532 -1.359718 -1.029379  0.008435 -1.156257  0.528667   

            7         8         9

In [14]:
from sklearn.model_selection import train_test_split
(X_train, X_Test, Y_train, Y_Test) = train_test_split(X, Y, test_size = 0.33, random_state = 1)

In [15]:
from xgboost import XGBClassifier

model = XGBClassifier()
model.fit(X_train, Y_train)

  return f(**kwargs)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [16]:
from sklearn.metrics import mean_squared_error
Y_pred = model.predict(X_Test)
mean_squared_error(Y_Test, Y_pred)

138.49531346292298

In [17]:
print(Y_pred)
print(np.sqrt(mean_squared_error(Y_Test, Y_pred)))

[ 75.86 107.5  103.62 ... 112.97  88.88  87.44]
11.76840318237453


In [20]:
#Preprocessing test data as well
test_set = pd.read_csv("/kaggle/input/mercedesbenz-greener-manufacturing/test.csv")
test_set = test_set.drop("ID",axis=1)

#label encoding
for columns in test_set.columns:
    if (test_set[columns].dtype == "object"):
        test_set[columns] = LabelEncoder().fit_transform(test_set[columns])

#normalize the data
min_max_scaler = preprocessing.MinMaxScaler()
X_normalized = min_max_scaler.fit_transform(test_set)
test_set = pd.DataFrame(X_normalized)

#applying pca
test_set_pca = pca.transform(test_set)
test_set_df = pd.DataFrame(test_set_pca)


KeyError: "['ID'] not found in axis"

In [None]:
test_pred = model.predict(test_set_df)