In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
from matplotlib import style

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, Ridge

import pickle

# When True, train_new_model() is run before the models are loaded from their pickle files.
TRAIN_NEW_MODEL = True
# Number of random train/test splits train_new_model() tries while searching for the best model.
NEW_MODEL_TRAIN_COUNT = 2000

In [None]:
# Load 'fortune_500.csv' into a DataFrame and display its column names.
df = pd.read_csv('fortune_500.csv')
df.columns

In [None]:
# Replace missing values with 0 (no rows are dropped by this step), keep only the
# rows whose Employees value is below 2,000,000, then summarise the column dtypes.
df = df.fillna(value=0)
df = df.loc[df['Employees'] < 2_000_000]
df.info()

In [None]:
# Plot the pairwise relationships between the DataFrame's variables with seaborn.
sns.pairplot(df)

In [None]:
x = df[['Profits', 'Employees', 'Assets', 'Totshequity']]  # Variables used to make predictions
y = df['Revenues']  # Variable we are going to predict
# Hold out 30% of the rows for evaluation. The fixed seed keeps this notebook-level
# split (used by the evaluation cells) identical on every Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

def train_new_model():
    """
    Fit Ridge and Lasso models on NEW_MODEL_TRAIN_COUNT random 70/30 splits of
    the notebook-level ``x`` / ``y`` and keep the best-scoring model of each kind.

    On every attempt each model is scored on that attempt's held-out 30%.
    Whenever the score beats the best seen so far, the model is pickled,
    overwriting the previous best:

    * Ridge -> 'fortune_500_ridge_reg_model.pickle'
    * Lasso -> 'fortune_500_lasso_reg_model.pickle'

    Returns nothing; the pickle files are the only output.
    """
    # score() is R^2 for these regressors and can be negative, so start from
    # -inf (not 0) to guarantee that the first attempt is always saved.
    best_ridge_score = float('-inf')
    best_lasso_score = float('-inf')
    for _ in range(NEW_MODEL_TRAIN_COUNT):

        # Fresh random split for this attempt. These are local names and do
        # not touch the notebook-level x_train / x_test / y_train / y_test.
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

        # Ridge regression.
        # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and
        # removed in 1.2, so this call needs an older release. Migrating means
        # scaling the features explicitly and re-tuning alpha.
        ridge_model = Ridge(alpha=1, normalize=True)
        ridge_model.fit(x_train, y_train)

        # Persist the model if it beats the current best. The file name must
        # match the one the notebook loads after training.
        ridge_score = ridge_model.score(x_test, y_test)
        if ridge_score > best_ridge_score:
            best_ridge_score = ridge_score
            with open('fortune_500_ridge_reg_model.pickle', 'wb') as f:
                pickle.dump(ridge_model, f)

        # Same approach for Lasso regression, on the same split.
        lasso_model = Lasso(alpha=1, normalize=True)
        lasso_model.fit(x_train, y_train)
        lasso_score = lasso_model.score(x_test, y_test)
        if lasso_score > best_lasso_score:
            best_lasso_score = lasso_score
            with open('fortune_500_lasso_reg_model.pickle', 'wb') as f:
                pickle.dump(lasso_model, f)

# Train new models if specified; either way, populate both models from their pickle files.
if TRAIN_NEW_MODEL:
    train_new_model()
# Context managers close each file handle as soon as the model has been read.
# NOTE: pickle.load can execute arbitrary code - only load pickle files produced by this notebook.
with open('fortune_500_ridge_reg_model.pickle', 'rb') as pickle_in:
    RidgeReg = pickle.load(pickle_in)
with open('fortune_500_lasso_reg_model.pickle', 'rb') as pickle_in:
    LassoReg = pickle.load(pickle_in)

In [None]:
# Print the Ridge model's score on the held-out split, then tabulate one coefficient per feature.
print('Accuracy: ', RidgeReg.score(X=x_test, y=y_test))
pd.DataFrame(data=RidgeReg.coef_, index=x.columns, columns=['Coeff'])

In [None]:
# Predict the target for the held-out rows with the Ridge model.
predictions = RidgeReg.predict(X=x_test)

# First figure: actual vs. predicted values as a scatter plot.
style.use('ggplot')
plt.scatter(x=y_test, y=predictions)
plt.ylabel('Predicted')
plt.xlabel('Actual')
plt.show()

# Second figure: the same points plus a degree-1 least-squares line of best fit.
slope, y_intercept = np.polyfit(x=y_test, y=predictions, deg=1)
plt.plot(y_test, predictions, 'o')
plt.ylabel('Predicted')
plt.xlabel('Actual')
plt.plot(y_test, y_intercept + slope*y_test)

In [None]:
# Histogram of the residuals (actual minus predicted) for the predictions computed above.
plt.hist(y_test - predictions)

In [None]:
def predict_individual(model, data, actual_revenue=None):
    """
    Predict an individual value given a model and data.

    :param model: fitted estimator exposing ``predict`` on a list of rows
    :param data: the feature values of a single row, in the model's feature order
    :param actual_revenue: (optional) known value, used to print how close the
        prediction is as the smaller/larger ratio in percent
    :return: the predicted value for ``data``
    """
    # predict() works on a batch, so wrap the single row and unwrap the result.
    prediction = model.predict([data])[0]
    print('Prediction: ', prediction)
    # Compare against None explicitly so that a legitimate actual value of 0
    # is still reported instead of being silently skipped.
    if actual_revenue is not None:
        if prediction < actual_revenue:
            numerator, denominator = prediction, actual_revenue
        else:
            numerator, denominator = actual_revenue, prediction
        if denominator != 0:
            print('Accuracy: ', numerator / denominator * 100, '%')
        elif numerator == 0:
            # Prediction and actual value are both exactly 0: a perfect match.
            print('Accuracy: ', 100.0, '%')
        else:
            # The ratio is undefined when the larger of the two values is 0.
            print('Accuracy:  undefined (division by zero)')
    return prediction

# Predict: Core-Mark Holding
# Inputs follow the order of the training columns: Profits, Employees, Assets, Totshequity.
predict_individual(RidgeReg, [54.2, 7688, 1497, 530], actual_revenue=11507)

In [None]:
# Mean squared error of the Ridge regression model's predictions on the x_test / y_test split.
print("Mean squared error: %.2f" % np.mean((RidgeReg.predict(x_test) - y_test) ** 2))

In [None]:
# Repeat the evaluation for the Lasso model so it can be compared against Ridge.

# Print the Lasso model's score on the held-out split, then tabulate one coefficient per feature.
print('Accuracy: ', LassoReg.score(X=x_test, y=y_test))
pd.DataFrame(data=LassoReg.coef_, index=x.columns, columns=['Coeff'])

In [None]:
# Predict the target for the held-out rows with the Lasso model.
predictions = LassoReg.predict(X=x_test)

# Fit and report a degree-1 least-squares line, then draw it over the actual-vs-predicted points.
slope, y_intercept = np.polyfit(x=y_test, y=predictions, deg=1)
print('Slope: ', slope)
plt.ylabel('Predicted')
plt.xlabel('Actual')
plt.plot(y_test, predictions, 'o')
plt.plot(y_test, y_intercept + slope*y_test)

In [None]:
# Same individual prediction as for RidgeReg, this time with the Lasso model.
# Predict: Core-Mark Holding
# Inputs follow the order of the training columns: Profits, Employees, Assets, Totshequity.
predict_individual(LassoReg, [54.2, 7688, 1497, 530], actual_revenue=11507)

In [None]:
# Mean squared error of the Lasso regression model's predictions on the x_test / y_test split.
print("Mean squared error: %.2f" % np.mean((LassoReg.predict(x_test) - y_test) ** 2))