I couldn't get LightGBM to work

# Importing the libraries

In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RepeatedKFold, train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import BayesianRidge, Lasso, LinearRegression, Ridge, RidgeCV
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

# Loading the data

In [None]:
# Location of the CSV file to analyse.
DATA_PATH = "../input/flight-take-off-data-jfk-airport/M1_final.csv"

# Read the file into a DataFrame and preview its first rows.
df = pd.read_csv(DATA_PATH)
df.head()

# Exploring the data

Knowledge about the data you are working on is very important for data analysis.

What I have used:

- [pandas.DataFrame.info](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.info.html)

- [pandas.DataFrame.isna](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.isna.html)

- [pandas.DataFrame.select_dtypes](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.select_dtypes.html)

In [None]:
# Print a concise summary of the DataFrame.
df.info()

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Preview only the columns stored with the object dtype.
df.select_dtypes(include=["object_"]).head()

Hmm... do you notice something weird? `Dew Point` should have a numeric dtype, but it has the object dtype! Let's fix this.

In [None]:
# Cast the column to a 64-bit integer dtype and write it back.
dew_point_as_int = df["Dew Point"].astype("int64")
df["Dew Point"] = dew_point_as_int

# Check that "Dew Point" is no longer listed among the object columns.
df.select_dtypes(include=["object_"]).head()

# Dropping null values

Earlier, while exploring the data, we saw that the `Wind` column has some missing values. Since our data set is large and the number of missing values is very small in comparison, we can safely drop those rows!

In [None]:
# Show the rows whose `Wind` value is missing.
wind_missing = df["Wind"].isna()
df[wind_missing]

In [None]:
# Remove, in place, every row that contains at least one missing value
# (in any column, not only `Wind`).
df.dropna(inplace=True)

In [None]:
# Number of missing values left in the `Wind` column.
df["Wind"].isna().sum()

# Creating copies of our DataFrame

In [None]:
# Two independent copies of the cleaned data:
#   df_le  -> will be used for label encoding
#   df_ohe -> will be used for one hot encoding
df_le, df_ohe = df.copy(), df.copy()

# Label Encoding the categorical columns

Most of the machine learning models don't like text. What do they like? Numbers. **Label Encoding** is an important part of **feature engineering** which is used to convert categorical data into numerical form so that they can be provided to our machine learning model.

Example: If we have two colors red and green

```
+-------+
| color |
+-------+
|  red  |
+-------+
| green |
+-------+
```

After applying Label Encoding this would be converted into something like:

```
+-------+
| color |
+-------+
|   0   |
+-------+
|   1   |
+-------+
```

We can do this by using [sklearn.preprocessing.LabelEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html)

Extra Resource: https://www.geeksforgeeks.org/ml-label-encoding-of-datasets-in-python/

In [None]:
le = LabelEncoder()

# Re-fit the encoder on each object column in turn and overwrite the column
# with its integer codes.
# NOTE(review): the codes are cast back to strings, so these columns keep the
# object dtype afterwards — confirm this is intended.
categorical_columns = df_le.select_dtypes(["object_"]).columns
for column in categorical_columns:
    codes = le.fit_transform(df_le[column])
    df_le[column] = codes.astype("str")

In [None]:
# Preview the object-dtype columns of the label-encoded frame.
df_le.select_dtypes(include=["object_"]).head()

# Building Models for Label Encoded Data

## Creating features (X) and label (y)

Features are often referred to as "independent variables" and Label is often referred to as "dependent variable".

Here `TAXI_OUT` is our label because it depends on other features.

In [None]:
# Label: the column we want to predict.
y = df_le["TAXI_OUT"]

# Features: every remaining column.
X = df_le.drop(columns=["TAXI_OUT"])

## Splitting the data in training and testing set


- Training data set is used for fitting our model to learn the patterns.
- Testing data set is used for prediction and unbiased evaluation of our final model

We can do this by using [sklearn.model_selection.train_test_split](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html).

Training data set - 90% of the total data

Testing data set - 10% of the total data

In [None]:
# Hold out 10% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [None]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    The result is rounded to two decimal places.
    """
    mse = mean_squared_error(y_true, y_pred)
    root = mse ** 0.5
    return round(root, 2)

In [None]:
# One RMSE slot per model trained on the label-encoded data (7 slots, all 0.0 to start).
rmse_le = np.zeros(shape=7, dtype=np.float64)

## Linear Regression

In [None]:
# Fit a linear regression on the training split.
reg1 = LinearRegression()
reg1.fit(X_train, y_train)

# Predict the test split, store the RMSE in slot 0 and echo it.
y_pred1 = reg1.predict(X_test)
rmse_le[0] = rmse(y_test, y_pred1)
rmse_le[0]

## Ridge Regression

In [None]:
# Candidate regularisation strengths: 0.00, 0.01, ... up to (not including) 1.
alpha_grid = np.arange(0, 1, 0.01)

# 10-fold cross-validation repeated 3 times, seeded for repeatability.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)

# Choose the alpha with the best negative mean absolute error across the folds.
reg = RidgeCV(
    alphas=alpha_grid,
    cv=cv,
    scoring='neg_mean_absolute_error',
)
reg.fit(X_train, y_train)
print(f"alpha: {reg.alpha_}")

In [None]:
# Ridge regression with a hard-coded alpha of 0.0, i.e. no penalty term.
# The `normalize` argument is no longer passed: it was deprecated in
# scikit-learn 1.0 and removed in 1.2, where passing it raises a TypeError.
# NOTE(review): alpha is hard-coded rather than read from the RidgeCV search
# (`reg.alpha_`) — confirm this is intended.
reg2 = Ridge(alpha=0.0).fit(X_train, y_train)

# Predict the test split, store the RMSE in slot 1 and echo it.
y_pred2 = reg2.predict(X_test)
rmse_le[1] = rmse(y_test, y_pred2)
rmse_le[1]

## Lasso Regression

In [None]:
# Lasso regression with the estimator's default settings.
reg3 = Lasso()
reg3.fit(X_train, y_train)

# Predict the test split, store the RMSE in slot 2 and echo it.
y_pred3 = reg3.predict(X_test)
rmse_le[2] = rmse(y_test, y_pred3)
rmse_le[2]

## KNN Model

In [None]:
# Try every K from 1 to 49 and record the share of test rows whose predicted
# class differs from the true value.
# NOTE(review): K is tuned with a classifier scored on the test split, while
# the model fitted afterwards is a KNeighborsRegressor scored by RMSE —
# confirm this is intended.
k_values = list(range(1, 50))
err_rate = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    err_rate.append(np.mean(y_pred != y_test))

min_err = min(err_rate)
# `k_index` is a 0-based position in `err_rate`; because the search starts at
# K = 1, the K it stands for is one larger. Report that K, not the position.
k_index = err_rate.index(min_err)
best_k = k_values[k_index]
print(f"Minimum error of {min_err} at K = {best_k}.")

In [None]:
# K-nearest-neighbours regression with 31 neighbours.
reg4 = KNeighborsRegressor(n_neighbors=31)
reg4.fit(X_train, y_train)

# Predict the test split, store the RMSE in slot 3 and echo it.
y_pred4 = reg4.predict(X_test)
rmse_le[3] = rmse(y_test, y_pred4)
rmse_le[3]

## Support Vector Regressor (SVR)

In [None]:
# Support vector regression with the estimator's default settings.
reg5 = SVR()
reg5.fit(X_train, y_train)

# Predict the test split, store the RMSE in slot 4 and echo it.
y_pred5 = reg5.predict(X_test)
rmse_le[4] = rmse(y_test, y_pred5)
rmse_le[4]

## Bayesian Ridge Regression

In [None]:
# Bayesian ridge regression with the estimator's default settings.
reg6 = BayesianRidge()
reg6.fit(X_train, y_train)

# Predict the test split, store the RMSE in slot 5 and echo it.
y_pred6 = reg6.predict(X_test)
rmse_le[5] = rmse(y_test, y_pred6)
rmse_le[5]

## Random Forest Regressor

In [None]:
# Random forest of 100 trees, seeded for repeatability.
reg7 = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred7 = reg7.predict(X_test)

# Score the forest's own predictions (`y_pred7`); scoring `y_pred4` here would
# just repeat the KNN result in slot 6.
rmse_le[6] = rmse(y_test, y_pred7)
rmse_le[6]

# One Hot Encoding the categorical columns

**One Hot Encoding** is an important part of **feature engineering** which is used to convert categorical data into numerical form so that they can be provided to our machine learning model.

Example: If we have two colors red and green and we want to represent red we could do something like

```
+---+-----+
|red|green|
+---+-----+
| 1 |  0  |
+---+-----+
```

These are often referred to as "dummy variables".

We can do this by using [pandas.get_dummies](https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html)

Alternate: [sklearn.preprocessing.OneHotEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html)

Extra Resource: https://www.educative.io/blog/one-hot-encoding

In [None]:
# Some columns are label encoded rather than one hot encoded because they
# mess up the model otherwise. Their integer codes are stored as strings.
le = LabelEncoder()
for column in ("TAIL_NUM", "Wind", "Condition"):
    df_ohe[column] = le.fit_transform(df_ohe[column]).astype(str)

# Expand the listed columns into one dummy column per distinct value.
dummy_columns = ["MONTH", "DAY_OF_WEEK", "OP_UNIQUE_CARRIER", "DEST"]
df_ohe = pd.get_dummies(df_ohe, columns=dummy_columns)

# Building Models for One Hot Encoded Data

## Creating features (X) and label (y)

In [None]:
# Label: the column we want to predict.
y = df_ohe["TAXI_OUT"]

# Features: every remaining column of the one hot encoded frame.
X = df_ohe.drop(columns=["TAXI_OUT"])

## Splitting the data in training and testing set

In [None]:
# Hold out 10% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [None]:
# One RMSE slot per model trained on the one hot encoded data (7 slots, all 0.0 to start).
rmse_ohe = np.zeros(shape=7, dtype=np.float64)

## Linear Regression

In [None]:
# Fit a linear regression on the training split.
reg1 = LinearRegression()
reg1.fit(X_train, y_train)

# Predict the test split, store the RMSE in slot 0 and echo it.
y_pred1 = reg1.predict(X_test)
rmse_ohe[0] = rmse(y_test, y_pred1)
rmse_ohe[0]

## Ridge Regression

In [None]:
# Ridge regression with a hard-coded alpha of 0.0, i.e. no penalty term.
# The `normalize` argument is no longer passed: it was deprecated in
# scikit-learn 1.0 and removed in 1.2, where passing it raises a TypeError.
# NOTE(review): no alpha search is run on the one hot encoded data; the value
# is hard-coded — confirm this is intended.
reg2 = Ridge(alpha=0.0).fit(X_train, y_train)

# Predict the test split, store the RMSE in slot 1 and echo it.
y_pred2 = reg2.predict(X_test)
rmse_ohe[1] = rmse(y_test, y_pred2)
rmse_ohe[1]

## Lasso Regression

In [None]:
# Lasso regression with the estimator's default settings.
reg3 = Lasso()
reg3.fit(X_train, y_train)

# Predict the test split, store the RMSE in slot 2 and echo it.
y_pred3 = reg3.predict(X_test)
rmse_ohe[2] = rmse(y_test, y_pred3)
rmse_ohe[2]

## KNN Model

In [None]:
# Try every K from 1 to 49 and record the share of test rows whose predicted
# class differs from the true value.
# NOTE(review): K is tuned with a classifier scored on the test split, while
# the model fitted afterwards is a KNeighborsRegressor scored by RMSE —
# confirm this is intended.
k_values = list(range(1, 50))
err_rate = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    err_rate.append(np.mean(y_pred != y_test))

min_err = min(err_rate)
# `k_index` is a 0-based position in `err_rate`; because the search starts at
# K = 1, the K it stands for is one larger. Report that K, not the position.
k_index = err_rate.index(min_err)
best_k = k_values[k_index]
print(f"Minimum error of {min_err} at K = {best_k}.")

In [None]:
# K-nearest-neighbours regression with 36 neighbours.
reg4 = KNeighborsRegressor(n_neighbors=36)
reg4.fit(X_train, y_train)

# Predict the test split, store the RMSE in slot 3 and echo it.
y_pred4 = reg4.predict(X_test)
rmse_ohe[3] = rmse(y_test, y_pred4)
rmse_ohe[3]

## Support Vector Regressor (SVR)

In [None]:
# Support vector regression with the estimator's default settings.
reg5 = SVR()
reg5.fit(X_train, y_train)

# Predict the test split, store the RMSE in slot 4 and echo it.
y_pred5 = reg5.predict(X_test)
rmse_ohe[4] = rmse(y_test, y_pred5)
rmse_ohe[4]

## Bayesian Ridge Regression

In [None]:
# Bayesian ridge regression with the estimator's default settings.
reg6 = BayesianRidge()
reg6.fit(X_train, y_train)

# Predict the test split, store the RMSE in slot 5 and echo it.
y_pred6 = reg6.predict(X_test)
rmse_ohe[5] = rmse(y_test, y_pred6)
rmse_ohe[5]

## Random Forest Regressor

In [None]:
# Random forest of 100 trees, seeded for repeatability.
reg7 = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred7 = reg7.predict(X_test)

# Score the forest's own predictions (`y_pred7`); scoring `y_pred4` here would
# just repeat the KNN result in slot 6.
rmse_ohe[6] = rmse(y_test, y_pred7)
rmse_ohe[6]

# Conclusion

In [None]:
# Model names, in the same order as the slots of `rmse_le` and `rmse_ohe`.
# Slot 5 holds the BayesianRidge model, so it is labelled accordingly, and the
# spelling of the last label is corrected.
models = [
    "Linear Regression", "Ridge Regression", "Lasso Regression",
    "KNN Model", "Support Vector Regression", "Bayesian Ridge",
    "Random Forest Regressor",
]

# One line per encoding strategy so the two can be compared model by model.
plt.plot(models, rmse_le, label="Label Encoding")
plt.plot(models, rmse_ohe, label="One Hot Encoding")

plt.xlabel("Model Name")
# Rotate the long model names so they do not overlap.
plt.xticks(rotation=-90)
plt.ylabel("RMSE")

plt.legend()
plt.show()