In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math
import matplotlib.pyplot as plt
from lightgbm import LGBMRegressor
import sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import seaborn as sns

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os

# Show which files are available under the input directory.
input_files = os.listdir("../input")
print(input_files)

# Any results you write to the current directory are saved as output.

> **load data**

In [None]:
# Read the ticket data; the three timestamp columns are parsed into datetimes while loading.
df = pd.read_csv(
    '../input/renfe.csv',
    parse_dates=['insert_date', 'start_date', 'end_date'],
)

> **data processing**

In [None]:
# preview the first rows of the freshly loaded frame
df.head()

In [None]:
# column dtypes - useful to check that the columns listed in parse_dates were read as datetimes
df.dtypes

In [None]:
# Discard every row that has at least one missing value (the pandas defaults, spelled out).
df = df.dropna(axis=0, how='any')

In [None]:
# remove not needed columns
#   "Unnamed: 0"  - presumably the row index that was written out with the CSV (TODO confirm)
#   "insert_date" - not referenced by any of the cells shown below
# Reassigning instead of inplace=True, together with errors='ignore', keeps the cell
# idempotent: re-running it once the columns are gone no longer raises KeyError.
df = df.drop(columns=["Unnamed: 0", "insert_date"], errors='ignore')

In [None]:
# add a column with the trip duration in hours
# .dt.total_seconds() converts the whole timedelta; .dt.seconds only returns the
# seconds component (0-86399) and would silently drop any whole days
df['duration (h)'] = (df['end_date']-df['start_date']).dt.total_seconds()/3600

In [None]:
# unique values of the origin column
# (each of them is later used as a key into the `coordinates` dict)
df['origin'].unique()

In [None]:
# unique values of the destination column
# (each of them is later used as a key into the `coordinates` dict)
df['destination'].unique()

In [None]:
# add a column with trip distance - it will allow us to differentiate routes
# because there are only few routes, we can treat this variable as a string, not necessarily as a continuous one

# create a dictionary with station coordinates as [latitude, longitude] in decimal degrees
# Longitudes west of the Greenwich meridian are negative. Of these stations only
# Barcelona lies east of it; storing the western ones as positive values shrinks the
# longitude difference on routes to Barcelona and underestimates their distance.
coordinates = {'MADRID':[40.4065,-3.6896],\
               'SEVILLA':[37.3911,-5.9755],\
               'PONFERRADA':[42.5454,-6.6023],\
               'BARCELONA':[41.3790,2.1399],\
               'VALENCIA':[39.4666,-0.3773]}

# Attach the coordinate pair of each endpoint as helper columns.
# The lambda indexes the dict directly, so an unknown station name raises KeyError.
for endpoint in ('origin', 'destination'):
    df[endpoint + '_coordinates'] = df[endpoint].map(lambda station: coordinates[station])

# great-circle distance between two coordinate pairs
def distance(origin, destination, radius=6371):
    """Return the great-circle distance between two points (haversine formula).

    Parameters
    ----------
    origin, destination : sequence of two numbers
        ``(latitude, longitude)`` in decimal degrees.
    radius : float, optional
        Radius of the sphere; the result is expressed in the same unit.
        Defaults to 6371 (km), so by default the result is in kilometres.

    Returns
    -------
    float
        Distance along the surface of the sphere.
    """
    lat1, lon1 = origin
    lat2, lon2 = destination

    half_dlat = math.radians(lat2-lat1) / 2
    half_dlon = math.radians(lon2-lon1) / 2
    sin_dlat = math.sin(half_dlat)
    sin_dlon = math.sin(half_dlon)

    a = sin_dlat * sin_dlat + math.cos(math.radians(lat1)) \
        * math.cos(math.radians(lat2)) * sin_dlon * sin_dlon
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) raise ValueError.
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))

    return radius * c

# create the distance column
# iterating over the two helper columns with zip avoids the much slower row-wise df.apply(axis=1)
df['distance (km)'] = [distance(start, end) for start, end
                       in zip(df['origin_coordinates'], df['destination_coordinates'])]

# get basic statistics for the new column
# printed explicitly: a bare expression in the middle of a cell is evaluated but never displayed
print(df['distance (km)'].describe())

# remove the helper columns, which are not needed anymore
df = df.drop(columns=['origin_coordinates','destination_coordinates'])

In [None]:
# Route label of the form "<origin> - <destination>".
# Unlike the symmetric distance value, this label also keeps the direction of travel.
df['origin - destination'] = df['origin'].str.cat(df['destination'], sep=' - ')

In [None]:
# Derive calendar/time-of-day features from the departure timestamp so the model
# can pick up seasonal price movements.
departure = df['start_date'].dt
for part in ('month', 'weekday', 'day', 'hour', 'minute'):
    df[part] = getattr(departure, part)

In [None]:
# preview the frame again, now including the engineered columns
df.head()

> **data analysis**

In [None]:
# number of trips by journey
# The column is passed by keyword (x=..., data=...): newer seaborn releases treat the
# first positional argument as `data`, so the positional form no longer selects the column.
fig, ax = plt.subplots(figsize=(20,6))
sns.countplot(x='origin - destination', data=df, ax=ax)
ax.set_title('number of trips by journey')
plt.show()

# price by journey
fig, ax = plt.subplots(figsize=(20,6))
sns.boxplot(x='origin - destination', y='price', data=df, ax=ax)
ax.set_title('price by journey')
plt.show()

In [None]:
# number of trips by type of train
# The column is passed by keyword (x=..., data=...): newer seaborn releases treat the
# first positional argument as `data`, so the positional form no longer selects the column.
fig, ax = plt.subplots(figsize=(20,6))
sns.countplot(x='train_type', data=df, ax=ax)
ax.set_title('number of trips by type of train')
plt.show()

# price by type of train
fig, ax = plt.subplots(figsize=(20,6))
sns.boxplot(x='train_type', y='price', data=df, ax=ax)
ax.set_title('price by type of train')
plt.show()

In [None]:
# number of trips by class
# The column is passed by keyword (x=..., data=...): newer seaborn releases treat the
# first positional argument as `data`, so the positional form no longer selects the column.
fig, ax = plt.subplots(figsize=(20,6))
sns.countplot(x='train_class', data=df, ax=ax)
ax.set_title('number of trips by class')
plt.show()

# price by class
fig, ax = plt.subplots(figsize=(20,6))
sns.boxplot(x='train_class', y='price', data=df, ax=ax)
ax.set_title('price by class')
plt.show()

In [None]:
# number of trips by fare category
# The column is passed by keyword (x=..., data=...): newer seaborn releases treat the
# first positional argument as `data`, so the positional form no longer selects the column.
fig, ax = plt.subplots(figsize=(20,6))
sns.countplot(x='fare', data=df, ax=ax)
ax.set_title('number of trips by fare category')
plt.show()

# price by fare category
fig, ax = plt.subplots(figsize=(20,6))
sns.boxplot(x='fare', y='price', data=df, ax=ax)
ax.set_title('price by fare category')
plt.show()

In [None]:
# Distribution of ticket prices, drawn on an explicitly created wide axis.
# NOTE(review): distplot is deprecated in newer seaborn releases - consider
# histplot/displot when the environment is upgraded.
fig, ax = plt.subplots(figsize=(20,6))
sns.distplot(df['price'], ax=ax)
plt.show()

In [None]:
# correlation plot
# Datetime columns are excluded before one-hot encoding: get_dummies leaves them
# untouched and .corr() cannot convert them to floats on pandas versions that no
# longer silently drop non-numeric columns.
corr_input = pd.get_dummies(df.select_dtypes(exclude=['datetime']))
f,ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr_input.corr(), annot=False, cmap = "Greens", linewidths=.5, fmt= '.2f',ax = ax)
plt.show()

> **data modelling**

**gradient boosting (LightGBM)**

In [None]:
# target variable: the price column the regressor is trained to predict
y = df['price']

In [None]:
# Build the feature matrix.
# get_dummies one-hot encodes the text columns and passes the numeric ones through unchanged.
base_columns = ['train_type','train_class','fare','month','weekday','day','hour','minute','duration (h)']
encoded_base = pd.get_dummies(df[base_columns])

# The distance is cast to str first so that it is encoded as a category
# (one indicator column per distinct value) rather than kept as a continuous number.
encoded_distance = pd.get_dummies(df[['distance (km)']].astype(str))

features = pd.concat([encoded_base, encoded_distance], axis=1)

In [None]:
# split into train (90%) and test (10%)
# random_state pins the shuffle so the split - and the scores computed from it - is
# reproducible across kernel restarts
X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.1, random_state=42)

In [None]:
# LightGBM gradient-boosting regressor; every other hyper-parameter is left at its default.
N_ESTIMATORS = 6000
gbr = LGBMRegressor(n_estimators=N_ESTIMATORS)
gbr.fit(X_train, y_train)

In [None]:
# score on the training data (R^2 under the scikit-learn regressor API - TODO confirm for the installed LightGBM version)
gbr.score(X_train, y_train)

In [None]:
# score on the held-out test data; compare with the training score to gauge overfitting
gbr.score(X_test, y_test)