In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

import time
import matplotlib.pyplot as plt
import seaborn as sns
import calendar
import folium
import lightgbm as lgbm

**1. DATA EXPLORATION**

**Training data has 55M rows. In this kernel, we shall only read in 5M rows**

In [None]:
# Read the first 5M rows of the training file and report the wall-clock load time.
start_time = time.time()
train = pd.read_csv(
    "../input/train.csv",
    nrows=5000000,
    usecols=[1, 2, 3, 4, 5, 6, 7],  # positional selection: columns 1-7, column 0 is skipped
)
print("%s seconds" % (time.time() - start_time))
train.head()

In [None]:
# Column dtypes, non-null counts and memory footprint of the freshly loaded training frame.
train.info()

**Reading testing data**

In [None]:
# Read the test file (columns 1-6 by position) and report the wall-clock load time.
start_time = time.time()
test = pd.read_csv(
    '../input/test.csv',
    usecols=[1, 2, 3, 4, 5, 6],
)
print("%s seconds" % (time.time() - start_time))

**Compare training data and testing data**

In [None]:
# Print both shapes first (test, then train), then a preview of each frame in the same order.
for frame in (test, train):
    print(frame.shape)

for frame in (test, train):
    print(frame.head())

*We can see that testing data lack one column : 'fare_amount', which is what we are going to predict.*

**Check for Missing Values**

In [None]:
# Per-column count of missing cells in the test set
print(test.isnull().sum())
# Per-column count of cells equal to zero in the test set
print(test.eq(0).astype(int).sum(axis=0))

*The test data is very clean, with no null value or zero value*

In [None]:
# Summary statistics of the numeric test columns; the min/max values printed here
# are what the later cleaning cell uses as a guide for valid ranges.
test.describe()

*By checking the description of the test data, we can choose to clean the training data based on these values. In other words, we can delete the training rows whose values fall outside these boundaries.*

In [None]:
# Per-column count of missing cells in the training set
print(train.isnull().sum())
# Per-column count of cells equal to zero in the training set
print(train.eq(0).astype(int).sum(axis=0))

In [None]:
# Summary statistics of the numeric training columns, for comparison with the test set.
train.describe()

*In our training data, there are some wrong values (for example, the minimum fare_amount is negative and the maximum passenger count is 208). We have to delete these values based on the value boundaries in the test data.*

**2.  DATA CLEANING**

Remove observations with zero or null values 

In [None]:
# Drop every row that has a missing value in any column,
# printing the row count before and after the deletion.
print("old: %d" %len(train))
train = train.dropna(axis=0, how='any')
print("new: %d" %len(train))

# Keep only the rows in which no column equals zero.
# (train != 0).all(axis=1) is the De Morgan form of ~(train == 0).any(axis=1).
print("old: %d" %len(train))
train = train[(train != 0).all(axis=1)]
print("new: %d" %len(train))

Remove observations with unusable values, based on the boundaries of the test data.
You can find information about NYC taxi fares at https://www1.nyc.gov/site/tlc/passengers/taxi-fare.page
For example, the initial charge is $2.50, so rows with a fare below this value should be removed.

In [None]:
# A row survives only if every coordinate, the passenger count and the fare
# fall inside the (inclusive) ranges below.
mask = (
    train['pickup_longitude'].between(-74.3, -72.9)
    & train['dropoff_longitude'].between(-74.3, -72.9)
    & train['pickup_latitude'].between(40.5, 41.8)
    & train['dropoff_latitude'].between(40.5, 41.7)
    & train['passenger_count'].between(0, 6)
    & train['fare_amount'].between(2.5, 200)
)

# Track the amount of data before and after the deletion
print("old: %d" %len(train))
train = train[mask]
print("new: %d" %len(train))

**Date time features**

Convert pickup_datetime from Object to Datetime object

In [None]:
def Convert_datetime(df):
    """Parse df['pickup_datetime'] in place from 'YYYY-MM-DD HH:MM:SS UTC' strings.

    The column is overwritten with the parsed datetime values; nothing is returned.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S UTC'
    parsed = pd.to_datetime(df['pickup_datetime'], format=timestamp_format)
    df['pickup_datetime'] = parsed

# Parse the timestamp column of both frames in place.
Convert_datetime(train)
Convert_datetime(test)
# Only the last expression of a cell is rendered, so train.head() is evaluated
# but not displayed; the visible preview is test.head().
train.head()
test.head()

In [None]:
# Re-inspect dtypes after the conversion above to confirm pickup_datetime is no longer an object column.
train.info()

Series.dt.date
Returns numpy array of python datetime.date objects (namely, the date part of Timestamps without timezone information).

In [None]:
def Divide_date(df):
    """Add calendar features derived from df['pickup_datetime'] in place.

    Creates the columns pickup_day, pickup_hour, pickup_month, pickup_year and
    weekday (Monday=0 ... Sunday=6). The column must already hold datetime
    values (see Convert_datetime). Nothing is returned.

    All features use the vectorized ``.dt`` accessor instead of a per-row
    ``apply``; the values are the same, only the extraction is faster.
    """
    pickup = df['pickup_datetime'].dt
    df['pickup_day'] = pickup.day
    df['pickup_hour'] = pickup.hour
    df['pickup_month'] = pickup.month
    df['pickup_year'] = pickup.year
    df['weekday'] = pickup.weekday

In [None]:
# Add the day/hour/month/year/weekday columns to both frames (in place).
Divide_date(train)
Divide_date(test)

**Distribution of Trip Fare**

In [None]:
# Kernel density estimate of the raw fare amounts
plt.figure(figsize=(8, 5))
fare_axes = sns.kdeplot(train['fare_amount'])
fare_axes.set_title("Distribution of Trip Fare")

*Since we saw above that fare amount is highly skewed, let us take log transformation of the fare amount and plot the distribution.*

In [None]:
# Same density plot on the natural-log scale to tame the skew
plt.figure(figsize=(8, 5))
log_fare_axes = sns.kdeplot(np.log(train['fare_amount'].values))
log_fare_axes.set_title("Distribution of fare amount (log scale)")

**Distribution of Pickup and Dropoff**

Let us look at Geographical Features

In [None]:
#Boundaries of the city
city_long_border = (-74.03, -73.75)
city_lat_border = (40.63, 40.85)

train.plot(kind='scatter', x='dropoff_longitude', y='dropoff_latitude',color='red', s=.02, alpha=.6)
plt.title("Dropoffs")

plt.ylim(city_lat_border)
plt.xlim(city_long_border)

In [None]:
# One tiny, semi-transparent green dot per pickup, zoomed to the same city window
train.plot.scatter(
    x='pickup_longitude',
    y='pickup_latitude',
    color='green',
    s=.02,
    alpha=.6,
)
plt.title("Pickups")

plt.ylim(city_lat_border)
plt.xlim(city_long_border)

*Apart from Manhattan, we can see heavy pickups and dropoffs near JFK and La Guardia Airport.*

* **Distribution of fare amount heatmap**

In [None]:
#Round pickup and dropoff lat lng to 3 decimal places
def Round3(df):
    """Add 3-decimal rounded copies of the four coordinate columns, in place.

    Creates pickup_latitude_round3, pickup_longitude_round3,
    dropoff_latitude_round3 and dropoff_longitude_round3. Nothing is returned.

    Uses the vectorized Series.round instead of calling Python's round() once
    per row through apply.
    """
    for column in ('pickup_latitude', 'pickup_longitude',
                   'dropoff_latitude', 'dropoff_longitude'):
        df[column + '_round3'] = df[column].round(3)

# Add the rounded coordinate columns to both frames (in place).
Round3(train)
Round3(test)

In [None]:
# Mean fare per rounded pickup location, as a flat frame with an 'avg_fare' column
pickup_fare_amount = (
    train
    .groupby(['pickup_latitude_round3', 'pickup_longitude_round3'])['fare_amount']
    .mean()
    .reset_index()
    .rename(columns={'fare_amount': 'avg_fare'})
)
pickup_fare_amount.head()

*In the scatter plot, we saw the high density of pickups and dropoffs from and to JFK and La Guardia Airport.
Let us look at over time how fares are from La Guardia and JFK*

In [None]:
# Bounding box around JFK airport (degrees)
JFK={'min_lng':-73.8352,
     'min_lat':40.6195,
     'max_lng':-73.7401, 
     'max_lat':40.6659}
JFK_center=[40.6437,-73.7900]

# Each subset is selected with one boolean mask built on `train` itself, so the
# mask and the frame it indexes always share the same index (the earlier
# two-step version indexed an already-filtered frame with a mask from `train`).

# All trips picked up inside the JFK box
JFK_data = train.loc[
    train.pickup_latitude.between(JFK['min_lat'], JFK['max_lat'])
    & train.pickup_longitude.between(JFK['min_lng'], JFK['max_lng'])
]

# All trips dropped off inside the JFK box
JFK_dropoff = train.loc[
    train.dropoff_latitude.between(JFK['min_lat'], JFK['max_lat'])
    & train.dropoff_longitude.between(JFK['min_lng'], JFK['max_lng'])
]

In [None]:
# Log-fare density of JFK pickups against the whole training set
plt.figure(figsize=(8, 5))
for frame, curve_label in ((JFK_data, 'JFK Pickups'),
                           (train, 'All Trips in Train data')):
    sns.kdeplot(np.log(frame['fare_amount'].values), label=curve_label)
plt.title("Fare Amount Distribution")

In [None]:
# Log-fare density of JFK dropoffs against the whole training set
plt.figure(figsize=(8, 5))
for frame, curve_label in ((JFK_dropoff, 'JFK Droppoffs'),
                           (train, 'All Trips in Train data')):
    sns.kdeplot(np.log(frame['fare_amount'].values), label=curve_label)
plt.title("Dropoffs vs Fare Amount")

*Distribution of fare amount for both pickup and dropoff to JFK is similar;
the fare amount is much higher when pickup or dropoff are from and to JFK.*

In [None]:
# The JFK subsets are not used again; release the names.
del JFK_data, JFK_dropoff

*Info : Normally airports pickup or dropoff have fixed prices. We can define non-airport category to see the effect on fare amount.*

**3. Feature engineering**

In [None]:
## Based on the above, let us create a function to see whether pickup or dropoff is an Airport. 
nyc_airports={'JFK':{'min_lng':-73.8352,
     'min_lat':40.6195,
     'max_lng':-73.7401, 
     'max_lat':40.6659},
              
    'EWR':{'min_lng':-74.1925,
            'min_lat':40.6700, 
            'max_lng':-74.1531, 
            'max_lat':40.7081

        },
    'LaGuardia':{'min_lng':-73.8895, 
                  'min_lat':40.7664, 
                  'max_lng':-73.8550, 
                  'max_lat':40.7931
    }
}
def isAirport(latitude,longitude,airport_name='JFK'):
    if latitude>=nyc_airports[airport_name]['min_lat'] and latitude<=nyc_airports[airport_name]['max_lat'] and longitude>=nyc_airports[airport_name]['min_lng'] and longitude<=nyc_airports[airport_name]['max_lng']:
        return 1
    else:
        return 0

In [None]:
def JFK(df):
    """Add 0/1 columns is_pickup_JFK and is_dropoff_JFK to df, in place.

    A point counts as "at JFK" when it lies inside the inclusive bounding box
    nyc_airports['JFK']. The test is done on whole columns with Series.between
    rather than row by row, which gives the same flags without a Python-level
    call per row.
    """
    box = nyc_airports['JFK']
    for endpoint in ('pickup', 'dropoff'):
        inside = (
            df[endpoint + '_latitude'].between(box['min_lat'], box['max_lat'])
            & df[endpoint + '_longitude'].between(box['min_lng'], box['max_lng'])
        )
        df['is_' + endpoint + '_JFK'] = inside.astype(int)
def EWR(df):
    """Add 0/1 columns is_pickup_EWR and is_dropoff_EWR to df, in place.

    A point counts as "at EWR" when it lies inside the inclusive bounding box
    nyc_airports['EWR']. The test is done on whole columns with Series.between
    rather than row by row, which gives the same flags without a Python-level
    call per row.
    """
    box = nyc_airports['EWR']
    for endpoint in ('pickup', 'dropoff'):
        inside = (
            df[endpoint + '_latitude'].between(box['min_lat'], box['max_lat'])
            & df[endpoint + '_longitude'].between(box['min_lng'], box['max_lng'])
        )
        df['is_' + endpoint + '_EWR'] = inside.astype(int)
def LaGuardia(df):
    """Add 0/1 columns is_pickup_la_guardia and is_dropoff_la_guardia to df, in place.

    A point counts as "at LaGuardia" when it lies inside the inclusive bounding
    box nyc_airports['LaGuardia']. The test is done on whole columns with
    Series.between rather than row by row, which gives the same flags without
    a Python-level call per row.
    """
    box = nyc_airports['LaGuardia']
    for endpoint in ('pickup', 'dropoff'):
        inside = (
            df[endpoint + '_latitude'].between(box['min_lat'], box['max_lat'])
            & df[endpoint + '_longitude'].between(box['min_lng'], box['max_lng'])
        )
        df['is_' + endpoint + '_la_guardia'] = inside.astype(int)

In [None]:
# Add the airport flags airport by airport, train before test for each one.
for add_airport_flags in (JFK, EWR, LaGuardia):
    for frame in (train, test):
        add_airport_flags(frame)

Here is a way to compute the earth-surface (great-circle) distance in km, based on the longitude and latitude of the pickup and dropoff points.

In [None]:
# Define distance in km
def dist(pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude):
    pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude = map(np.radians, [pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude])
    dlon = dropoff_longitude - pickup_longitude
    dlat = dropoff_latitude - pickup_latitude
    a = np.sin(dlat/2.0)**2 + np.cos(pickup_latitude) * np.cos(dropoff_latitude) * np.sin(dlon/2.0)**2
    c = 2 * np.arcsin(np.sqrt(a))
    distance = 6367 * c
    return distance

In [None]:
# dist() is built from NumPy ufuncs only, so it accepts whole columns: call it
# once per frame instead of once per row through DataFrame.apply(axis=1).
for frame in (train, test):
    frame['trip_distance'] = dist(
        frame['pickup_longitude'],
        frame['pickup_latitude'],
        frame['dropoff_longitude'],
        frame['dropoff_latitude'],
    )

In [None]:
# Preview the training frame with the engineered columns added so far.
train.head()

In [None]:
# Density of the natural log of trip distance.
# NOTE(review): dist() returns 0 when pickup and dropoff coordinates are identical,
# and np.log(0) is -inf; if such rows exist they reach the KDE as -inf — confirm
# whether they should be filtered out first.
sns.kdeplot(np.log(train['trip_distance'].values)).set_title("Distribution of Trip Distance (log scale)")

In [None]:
# Fare against distance for every training trip (low alpha so dense regions show up)
plt.scatter(train['trip_distance'], train['fare_amount'], alpha=0.1)
plt.xlabel("Trip Distance")
plt.ylabel("Fare Amount")
plt.title("Trip Distance vs Fare Amount")

*The fare seems to be fixed for trip distances > 80 km; generally they are dropoffs and pickups from and to airports.*

In [None]:
# Trips that neither start nor end at any of the three airports:
# every one of the six airport flags must be 0.
airport_flags = [
    'is_dropoff_JFK', 'is_dropoff_EWR', 'is_dropoff_la_guardia',
    'is_pickup_JFK', 'is_pickup_EWR', 'is_pickup_la_guardia',
]
non_airport = train.loc[(train[airport_flags] == 0).all(axis=1)]

In [None]:
# Same fare-vs-distance view, restricted to the non-airport trips
plt.scatter(non_airport['trip_distance'], non_airport['fare_amount'], alpha=0.1)
plt.xlabel("Trip Distance")
plt.ylabel("Fare Amount")
plt.title("Trip Distance vs Fare Amount (excluding airport rides)")

*In the plot above, we can see two clusters with a linear relationship between taxi fare and distance. But for trip distances > 80 km, though a linear relationship exists, the fare amount is very low. Let us check where these trips originate and end.*

In [None]:
# Non-airport trips whose computed distance is at least 80 (km, per dist()).
non_airport_long_trips=non_airport[non_airport['trip_distance']>=80]

In [None]:
drop_map = folium.Map(location=[40.730610, -73.935242], zoom_start=11.7)

# Two passes over the long trips: all dropoffs first (green), then all pickups (red),
# each marker placed at the rounded coordinates.
for endpoint, marker_color in (('dropoff', 'green'), ('pickup', 'red')):
    lat_column = endpoint + '_latitude_round3'
    lng_column = endpoint + '_longitude_round3'
    for _, trip in non_airport_long_trips.iterrows():
        marker = folium.CircleMarker(
            [trip[lat_column], trip[lng_column]],
            radius=3,
            color=marker_color,
            fill_opacity=0.9,
        )
        marker.add_to(drop_map)
drop_map

*Most of the long trips dropoffs and pickups are in lower Manhattan. There are a lot of dropoffs in Brooklyn*

New York City is divided into five boroughs. Let us determine which borough each pickup and dropoff point falls in, and whether that affects the fare.

In [None]:
# Bounding boxes (degrees) used to approximate each borough. The boxes overlap,
# so the insertion order below decides which borough wins for a shared point.
nyc_boroughs = {
    'manhattan': {'min_lng': -74.0479, 'min_lat': 40.6829,
                  'max_lng': -73.9067, 'max_lat': 40.8820},
    'queens': {'min_lng': -73.9630, 'min_lat': 40.5431,
               'max_lng': -73.7004, 'max_lat': 40.8007},
    'brooklyn': {'min_lng': -74.0421, 'min_lat': 40.5707,
                 'max_lng': -73.8334, 'max_lat': 40.7395},
    'bronx': {'min_lng': -73.9339, 'min_lat': 40.7855,
              'max_lng': -73.7654, 'max_lat': 40.9176},
    'staten_island': {'min_lng': -74.2558, 'min_lat': 40.4960,
                      'max_lng': -74.0522, 'max_lat': 40.6490},
}


def getBorough(lat, lng):
    """Return the first borough (in dict order) whose box contains the point.

    Box edges are inclusive. Returns 'others' when no box matches.
    """
    matches = (
        name
        for name, box in nyc_boroughs.items()
        if box['min_lat'] <= lat <= box['max_lat']
        and box['min_lng'] <= lng <= box['max_lng']
    )
    return next(matches, 'others')

In [None]:
# Label the pickup and dropoff point of every trip with a borough,
# for the training frame first and then the test frame.
for frame in (train, test):
    for endpoint in ('pickup', 'dropoff'):
        frame[endpoint + '_borough'] = frame.apply(
            lambda row, endpoint=endpoint: getBorough(
                row[endpoint + '_latitude'], row[endpoint + '_longitude']
            ),
            axis=1,
        )

In [None]:
# Number of pickups per borough.
# The Series is passed as x= explicitly: recent seaborn releases accept only
# `data` positionally, so a bare positional Series is no longer read as the x variable.
plt.figure(figsize=(8,5))
sns.countplot(x=train['pickup_borough'])
plt.title("Distribution of Pickup Boroughs")

In [None]:
# One panel per borough comparing log-fare densities of pickups and dropoffs.
plt.figure(figsize=(16,10))
# suptitle titles the whole figure; plt.title here would attach to an implicit
# full-figure axes created before the subplot grid, not to the figure.
plt.suptitle("Distribution of Fare Amount Across Buroughs")
for i, key in enumerate(nyc_boroughs.keys(), start=1):
    plt.subplot(3,2,i)
    sns.kdeplot(np.log(train.loc[train['pickup_borough']==key,'fare_amount'].values),label='Pickup '+ key)
    sns.kdeplot(np.log(train.loc[train['dropoff_borough']==key,'fare_amount'].values),label='Dropoff'+ key).set_title("Fare Amount (log scale) for "+key)

*There is a significant difference between pickup and dropoff fare amounts for each borough except Manhattan. We can see that pickups from Queens are expensive compared to pickups from other boroughs. There is a very large difference between pickup and dropoff prices for Staten Island.*

In [None]:
# One panel per borough comparing log-distance densities of pickups and dropoffs.
plt.figure(figsize=(16,10))
# suptitle titles the whole figure; plt.title here would attach to an implicit
# full-figure axes created before the subplot grid, not to the figure.
plt.suptitle("Distribution of Trip Distances Across Buroughs")
for i, key in enumerate(nyc_boroughs.keys(), start=1):
    plt.subplot(3,2,i)
    sns.kdeplot(np.log(train.loc[train['pickup_borough']==key,'trip_distance'].values),label='Pickup '+ key)
    sns.kdeplot(np.log(train.loc[train['dropoff_borough']==key,'trip_distance'].values),label='Dropoff'+ key).set_title("Trip Distance (log scale) for "+key)

Dropoffs to Bronx and Staten island are long trips. In Manhattan the pickup and dropoffs fare amount has similar distribution. Let us add a field, is_lower_manhattan as we had seen above that dropoffs to lower manhattan had higher trip distance but lower fare

In [None]:
# Bounding box (degrees) used for "lower Manhattan"
lower_manhattan_boundary = {
    'min_lng': -74.0194,
    'min_lat': 40.6997,
    'max_lng': -73.9716,
    'max_lat': 40.7427,
}


def isLowerManhattan(lat, lng):
    """Return 1 if the point lies inside the lower-Manhattan box (edges inclusive), else 0."""
    box = lower_manhattan_boundary
    lat_inside = box['min_lat'] <= lat <= box['max_lat']
    lng_inside = box['min_lng'] <= lng <= box['max_lng']
    return 1 if (lat_inside and lng_inside) else 0

In [None]:
# Flag pickups and dropoffs that fall in lower Manhattan,
# for the training frame first and then the test frame.
for frame in (train, test):
    for endpoint in ('pickup', 'dropoff'):
        frame['is_' + endpoint + '_lower_manhattan'] = frame.apply(
            lambda row, endpoint=endpoint: isLowerManhattan(
                row[endpoint + '_latitude'], row[endpoint + '_longitude']
            ),
            axis=1,
        )

In [None]:
# Trips that touch Manhattan at either end
touches_manhattan = (train['pickup_borough'] == 'manhattan') | (train['dropoff_borough'] == 'manhattan')
manhattan = train.loc[touches_manhattan]

plt.figure(figsize=(16, 10))

# Left panel: log-fare density split by whether the pickup is in lower Manhattan
plt.subplot(3, 2, 1)
pickup_flag = manhattan['is_pickup_lower_manhattan']
sns.kdeplot(np.log(manhattan.loc[pickup_flag == 1, 'fare_amount'].values),
            label='Lower Manhattan Pickups')
sns.kdeplot(np.log(manhattan.loc[pickup_flag == 0, 'fare_amount'].values),
            label='Rest of Manhattan Pickups')
plt.xlabel("fare amount (log)")
plt.title("Distribution of Fare Amount - Manhattan vs Lower Manhattan")

# Right panel: the same split on the dropoff flag
plt.subplot(3, 2, 2)
dropoff_flag = manhattan['is_dropoff_lower_manhattan']
sns.kdeplot(np.log(manhattan.loc[dropoff_flag == 1, 'fare_amount'].values),
            label='Lower Manhattan Dropoffs')
sns.kdeplot(np.log(manhattan.loc[dropoff_flag == 0, 'fare_amount'].values),
            label='Rest of Manhattan Dropoffs')
plt.xlabel("fare amount (log)")
plt.title("Distribution of Fare Amount - Manhattan vs Lower Manhattan")

In [None]:
# Fare against distance for Manhattan trips picked up in lower Manhattan
lower_pickups = manhattan.loc[manhattan['is_pickup_lower_manhattan'] == 1]
plt.scatter(
    x=lower_pickups['trip_distance'].values,
    y=lower_pickups['fare_amount'].values,
    alpha=0.1,
)
plt.xlabel("Trip Distance")
plt.ylabel("Fare Amount")
plt.title("Trip Distance vs Fare Amount (Lower Manhattan pickups)")

In [None]:
# Fare against distance for Manhattan trips picked up outside lower Manhattan
rest_pickups = manhattan.loc[manhattan['is_pickup_lower_manhattan'] == 0]
plt.scatter(
    x=rest_pickups['trip_distance'].values,
    y=rest_pickups['fare_amount'].values,
    alpha=0.1,
)
plt.xlabel("Trip Distance")
plt.ylabel("Fare Amount")
plt.title("Trip Distance vs Fare Amount (Rest of Manhattan pickups)")

In [None]:
# Fare against distance for Manhattan trips dropped off in lower Manhattan
lower_dropoffs = manhattan.loc[manhattan['is_dropoff_lower_manhattan'] == 1]
plt.scatter(
    x=lower_dropoffs['trip_distance'].values,
    y=lower_dropoffs['fare_amount'].values,
    alpha=0.1,
)
plt.xlabel("Trip Distance")
plt.ylabel("Fare Amount")
plt.title("Trip Distance vs Fare Amount (Lower Manhattan dropoffs)")

In [None]:
# Fare against distance for Manhattan trips dropped off outside lower Manhattan
rest_dropoffs = manhattan.loc[manhattan['is_dropoff_lower_manhattan'] == 0]
plt.scatter(
    x=rest_dropoffs['trip_distance'].values,
    y=rest_dropoffs['fare_amount'].values,
    alpha=0.1,
)
plt.xlabel("Trip Distance")
plt.ylabel("Fare Amount")
plt.title("Trip Distance vs Fare Amount (Rest of Manhattan dropoffs)")

The distribution of trip distance and fare amount for Lower Manhattan pickups and dropoffs is very different. Also, the slope of the linear relationship for Lower Manhattan pickups is higher than that for the rest of Manhattan.

Let us now look at the datetime features and their relationship with the fare amount.

In [None]:
# Mean fare per pickup year, as a flat frame with an 'avg_fare_amount' column
trips_year_fareamount = (
    train
    .groupby(['pickup_year'])['fare_amount']
    .mean()
    .reset_index()
    .rename(columns={'fare_amount': 'avg_fare_amount'})
)
trips_year_fareamount.head()

In [None]:
# Bar chart of the yearly mean fare
yearly_axes = sns.barplot(
    x='pickup_year',
    y='avg_fare_amount',
    data=trips_year_fareamount,
)
yearly_axes.set_title("Average Fare Amount over Years")

*The average fare amount has been increasing over the years.*

In [None]:
#Info : There is a daily 50-cent surcharge from 8pm to 6am.
train['night_hour'] = np.where((train['pickup_hour'] >= 20) | (train['pickup_hour'] <= 6) , 1, 0)
test['night_hour'] = np.where((test['pickup_hour'] >= 20) | (test['pickup_hour'] <= 6) , 1, 0)

#Info : There is a $1 surcharge from 4pm to 8pm on weekdays, excluding holidays.
train['peak_hour'] = np.where((train['pickup_hour'] >= 16) 
                              & (train['pickup_hour'] <= 20) 
                              &  (train['weekday'] >= 0) 
                              & (train['weekday'] <= 4) , 1, 0)
    
test['peak_hour'] = np.where((test['pickup_hour'] >= 16) &
                            (test['pickup_hour'] <= 20) & 
                            (test['weekday'] >=0) &
                            (test['weekday'] <=4) , 1, 0)

**4. Prediction**

In [None]:
# Shapes after feature engineering: test first, then train
for frame in (test, train):
    print(frame.shape)

In [None]:
# Check corr of 'fare_amount' to all the other variables
print(train.corrwith(train['fare_amount']))

In [None]:
#We can choose to remove the variables that has the corr less than 0.1
train = train.drop(['passenger_count','pickup_hour','pickup_day','pickup_month','weekday'], axis = 1)
test = test.drop(['passenger_count','pickup_hour','pickup_day','pickup_month','weekday'], axis = 1)

In [None]:
# Remove the timestamp and the string borough labels from both frames
train = train.drop(columns=['pickup_datetime', 'pickup_borough', 'dropoff_borough'])
test = test.drop(columns=['pickup_datetime', 'pickup_borough', 'dropoff_borough'])

In [None]:
#Split the train data for model training
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(train.drop('fare_amount', axis=1),
                                                    train['fare_amount'], test_size=0.15, random_state = 111)

# Check shape
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
# Use lightgbm model to do the training.
params = {
        'boosting_type':'gbdt',
        'objective': 'regression',
        'nthread': 4,
        'num_leaves': 31,
        'learning_rate': 0.05,
        'max_depth': -1,
        'subsample': 0.8,
        'bagging_fraction' : 1,
        'max_bin' : 5000 ,
        'bagging_freq': 20,
        'colsample_bytree': 0.6,
        'metric': 'rmse',
        'min_split_gain': 0.5,
        'min_child_weight': 1,
        'min_child_samples': 10,
        'scale_pos_weight':1,
        'zero_as_missing': True,
        'seed':0,
        'num_rounds':50000
    }
def LGBMmodel(X_train,X_test,y_train,y_test,params):
    """Train a LightGBM booster on (X_train, y_train), validating on (X_test, y_test).

    Training stops early once the validation metric has not improved for 500
    rounds, and the metric is logged every 100 rounds.

    Parameters:
        X_train, y_train: training features and target.
        X_test, y_test: hold-out features and target used as the validation set.
        params: dict of LightGBM parameters passed through to lgbm.train().

    Returns:
        The value returned by lgbm.train() (the trained booster).

    NOTE(review): if `params` carries its own iteration-count alias (e.g.
    'num_rounds'), it may take precedence over num_boost_round below — confirm.
    """
    matrix_train = lgbm.Dataset(X_train, y_train)
    matrix_test = lgbm.Dataset(X_test, y_test)

    # Newer LightGBM releases dropped the early_stopping_rounds / verbose_eval
    # keywords of train() in favour of callbacks. Use the callbacks when this
    # installation provides them; otherwise keep the legacy keywords so older
    # installations behave exactly as before.
    if hasattr(lgbm, 'log_evaluation'):
        stopping_options = {
            'callbacks': [
                lgbm.early_stopping(stopping_rounds=500),
                lgbm.log_evaluation(period=100),
            ]
        }
    else:
        stopping_options = {'early_stopping_rounds': 500, 'verbose_eval': 100}

    model=lgbm.train(params=params,
                    train_set=matrix_train,
                    num_boost_round=100000,
                    valid_sets=[matrix_test],
                    **stopping_options)
    return model

In [None]:
# Train the model

model = LGBMmodel(X_train,X_test,y_train,y_test,params)