# Pre-processing

## SetUp

In [2]:
import pandas as pd 
from sklearn.model_selection import train_test_split

In [None]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,shuffle=True)

In [2]:
data=pd.read_csv('prearedData.csv', index_col=0)
df=data.copy()

### 1. Feature Selection

In [3]:
data.columns

Index(['manufacturer', 'model', 'year', 'version', 'power(kW)', 'fuel_date',
       'odometer', 'trip_distance(km)', 'quantity(kWh)', 'fuel_type',
       'tire_type', 'city', 'motor_way', 'country_roads', 'driving_style',
       'consumption(kWh/100km)', 'A/C', 'park_heating', 'avg_speed(km/h)',
       'ecr_deviation', 'fuel_note', 'user_id'],
      dtype='object')

In [4]:
data_sel=data[[ 'power(kW)', 'fuel_date',
      'trip_distance(km)', 'quantity(kWh)', 
       'tire_type', 'city', 'motor_way', 'country_roads', 'driving_style',
        'park_heating', 'avg_speed(km/h)',
       'ecr_deviation']]

In [5]:
data_sel.head()

Unnamed: 0,power(kW),fuel_date,trip_distance(km),quantity(kWh),tire_type,city,motor_way,country_roads,driving_style,park_heating,avg_speed(km/h),ecr_deviation
13,100.0,14.01.2022,59.0,6.0,All-year tires,0.0,1.0,0.0,Normal,0.0,,4.1
14,100.0,13.01.2022,59.0,23.0,All-year tires,0.0,1.0,0.0,Normal,0.0,,4.1
15,100.0,13.01.2022,49.0,7.0,All-year tires,0.0,1.0,0.0,Normal,0.0,,4.1
16,100.0,13.01.2022,71.0,4.0,All-year tires,0.0,1.0,0.0,Normal,0.0,,4.1
17,100.0,12.01.2022,59.0,20.0,All-year tires,0.0,1.0,0.0,Normal,0.0,,4.1


### 2. Data type Conversion

In [6]:
data_sel.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31736 entries, 13 to 31748
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   power(kW)          31734 non-null  float64
 1   fuel_date          31736 non-null  object 
 2   trip_distance(km)  31734 non-null  float64
 3   quantity(kWh)      31734 non-null  float64
 4   tire_type          31646 non-null  object 
 5   city               31734 non-null  float64
 6   motor_way          31734 non-null  float64
 7   country_roads      31734 non-null  float64
 8   driving_style      31625 non-null  object 
 9   park_heating       31734 non-null  float64
 10  avg_speed(km/h)    5463 non-null   object 
 11  ecr_deviation      27573 non-null  float64
dtypes: float64(8), object(4)
memory usage: 3.1+ MB


In [7]:
data_sel.loc[:,'tire_type']=data_sel['tire_type'].map({'All-year tires':1,'Summer tires':3,'Winter tires':2, 'tire_type':0})
data_sel.loc[:,'driving_style']=data_sel['driving_style'].map({'Fast':1,'Moderate':2,'Normal':3})
data_sel=data_sel[(data_sel['driving_style']!='driving_style') & data_sel['tire_type']!='tire_type']
def convert_to_numeric(columns, df):
    for col in columns:
        df[col]=pd.to_numeric(df[col], errors='coerce')
    return df
        
colums_for_num_cov=['power(kW)','quantity(kWh)','consumption(kWh/100km)','trip_distance(km)','ecr_deviation','odometer']
data_sel_num=convert_to_numeric(data_sel.drop(columns='fuel_date').columns, data_sel) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [8]:
data_sel_num.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31736 entries, 13 to 31748
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   power(kW)          31734 non-null  float64
 1   fuel_date          31736 non-null  object 
 2   trip_distance(km)  31734 non-null  float64
 3   quantity(kWh)      31734 non-null  float64
 4   tire_type          31646 non-null  float64
 5   city               31734 non-null  float64
 6   motor_way          31734 non-null  float64
 7   country_roads      31734 non-null  float64
 8   driving_style      31623 non-null  float64
 9   park_heating       31734 non-null  float64
 10  avg_speed(km/h)    5461 non-null   float64
 11  ecr_deviation      27573 non-null  float64
dtypes: float64(11), object(1)
memory usage: 3.1+ MB


In [9]:
data_sel_num.head()

Unnamed: 0,power(kW),fuel_date,trip_distance(km),quantity(kWh),tire_type,city,motor_way,country_roads,driving_style,park_heating,avg_speed(km/h),ecr_deviation
13,100.0,14.01.2022,59.0,6.0,1.0,0.0,1.0,0.0,3.0,0.0,,4.1
14,100.0,13.01.2022,59.0,23.0,1.0,0.0,1.0,0.0,3.0,0.0,,4.1
15,100.0,13.01.2022,49.0,7.0,1.0,0.0,1.0,0.0,3.0,0.0,,4.1
16,100.0,13.01.2022,71.0,4.0,1.0,0.0,1.0,0.0,3.0,0.0,,4.1
17,100.0,12.01.2022,59.0,20.0,1.0,0.0,1.0,0.0,3.0,0.0,,4.1


### 3. Tranform Fuel Date

In [10]:

data_sel_num['fuel_date']=data_sel_num['fuel_date'].apply(lambda x :x.replace(".",'-'))
data_sel_num['fuel_date_new']=pd.to_datetime(data_sel_num['fuel_date'], errors='coerce', format='%d-%m-%Y')
# extracting month , day and day of the week from fuel_date
data_sel_num['month']=data_sel_num['fuel_date_new'].dt.month
data_sel_num['weekday']=data_sel_num['fuel_date_new'].dt.weekday
data_sel_num['day']=data_sel_num['fuel_date_new'].dt.day
data_sel_num.drop(columns=['fuel_date','fuel_date_new'], axis=1, inplace=True)

In [11]:
data_sel_num.head()

Unnamed: 0,power(kW),trip_distance(km),quantity(kWh),tire_type,city,motor_way,country_roads,driving_style,park_heating,avg_speed(km/h),ecr_deviation,month,weekday,day
13,100.0,59.0,6.0,1.0,0.0,1.0,0.0,3.0,0.0,,4.1,1.0,4.0,14.0
14,100.0,59.0,23.0,1.0,0.0,1.0,0.0,3.0,0.0,,4.1,1.0,3.0,13.0
15,100.0,49.0,7.0,1.0,0.0,1.0,0.0,3.0,0.0,,4.1,1.0,3.0,13.0
16,100.0,71.0,4.0,1.0,0.0,1.0,0.0,3.0,0.0,,4.1,1.0,3.0,13.0
17,100.0,59.0,20.0,1.0,0.0,1.0,0.0,3.0,0.0,,4.1,1.0,2.0,12.0


### 4. Clipping outliers

In [12]:
data_sel_num=data_sel_num[data_sel_num['trip_distance(km)']<=410.00]

 ### 5. Filling Missing values

In [13]:
data_sel_num.fillna(value=0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [14]:
data_sel_num.head()

Unnamed: 0,power(kW),trip_distance(km),quantity(kWh),tire_type,city,motor_way,country_roads,driving_style,park_heating,avg_speed(km/h),ecr_deviation,month,weekday,day
13,100.0,59.0,6.0,1.0,0.0,1.0,0.0,3.0,0.0,0.0,4.1,1.0,4.0,14.0
14,100.0,59.0,23.0,1.0,0.0,1.0,0.0,3.0,0.0,0.0,4.1,1.0,3.0,13.0
15,100.0,49.0,7.0,1.0,0.0,1.0,0.0,3.0,0.0,0.0,4.1,1.0,3.0,13.0
16,100.0,71.0,4.0,1.0,0.0,1.0,0.0,3.0,0.0,0.0,4.1,1.0,3.0,13.0
17,100.0,59.0,20.0,1.0,0.0,1.0,0.0,3.0,0.0,0.0,4.1,1.0,2.0,12.0


In [None]:
# Fitting the model
xgb_r.fit(X_train_rf,np.ravel(y_train))