In [None]:
## Import Libraries used to perform analysis

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
from sklearn.model_selection import train_test_split

### **Importing a Simple .csv File**

Start by importing the .csv file using the method `pd.read_csv('location')`.

In [None]:
# Upload a file from your computer into the notebook session.
# `files` comes from google.colab, so this cell is Colab-specific. The result is kept
# in `uploaded`, which is later indexed by file name when the CSV is read.

from google.colab import files
uploaded = files.upload()

In [None]:
# Read the uploaded CSV into the DataFrame `df`

import io

# Assumes the uploaded file is named 'auto-msg.csv'
csv_bytes = uploaded['auto-msg.csv']
df = pd.read_csv(io.BytesIO(csv_bytes))

# Display the first few rows
print(df.head())

In [None]:
# Preview the first rows of the loaded data
df.head()

In [None]:
# Preview the last rows of the loaded data
df.tail()

In [None]:
# Build the design matrix X (independent variables plus a constant term)
# and the response y (dependent variable, 'mpg')

feature_cols = ['cylinders','displacement','horsepower','weight','acceleration','model year','origin']
X = sm.add_constant(df[feature_cols])
y = df['mpg']

In [None]:
# First attempt at fitting an ordinary least squares model of y on X.
# NOTE(review): 'horsepower' has not been cleaned yet at this point (the cells below find
# it stored as object with '?' entries), so this fit presumably fails — confirm.
model = sm.OLS(y,X).fit()

In [None]:
# Convert the whole DataFrame to a NumPy array to inspect the raw values
# NOTE(review): presumably a diagnostic for the failed fit above — confirm it is still needed.
np.asarray(df)

In [None]:
# List the dtype of every column to spot the non-numeric ones
print(df.dtypes)

In [None]:
# Inspect the distinct values stored in 'horsepower' and how often each one occurs
print(df['horsepower'].unique())
print(df['horsepower'].value_counts())

In [None]:
# Spot errors in the dataset:
# 'horsepower' is stored as an object column rather than as a number, so it cannot be analysed as-is.
# Show the rows whose horsepower is the placeholder string '?'.

df[df['horsepower'] == '?']

In [None]:
# 'horsepower' is an object column because some rows hold the placeholder '?'.
# Drop those rows, then convert the column to integers.
# .copy() makes the filtered frame independent of the original one, so the column
# assignment below no longer triggers pandas' SettingWithCopyWarning.

df = df[df['horsepower'] != '?'].copy()
df['horsepower'] = df['horsepower'].astype(int)

In [None]:
# Rebuild the design matrix (predictors plus a constant term) and the response from the cleaned data
feature_cols = ['cylinders','displacement','horsepower','weight','acceleration','model year','origin']
X = sm.add_constant(df[feature_cols])
y = df['mpg']

In [None]:
# Fit OLS of y on X using every row of the cleaned data
model = sm.OLS(y,X).fit()

In [None]:
# Estimated coefficients of the fitted model (including the constant term added to X)
model.params

In [None]:
# In-sample check: compare the predictions of `model` with the observed y

# Predictions for the same rows the model was fitted on
y_pred = model.predict(X)

# Scatter of predicted vs. actual values, with a dashed y = x reference line
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y, y_pred, alpha=0.5)
diagonal = [y.min(), y.max()]
ax.plot(diagonal, diagonal, color='red', linestyle='--')
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
ax.set_title('Predicted vs Actual Values')
ax.grid(True)
plt.show()

# Show the statistical summary
print(model.summary())

## Using a train / test split

In [None]:
# Hold out a test set; random_state fixes the shuffle so the split is repeatable.
# NOTE(review): an integer test_size is an absolute row count in scikit-learn, so this
# keeps 20 rows for testing rather than 20% of the data — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 20, random_state=123)

In [None]:
# Fit OLS on the training rows only
lr = sm.OLS(y_train,X_train).fit()

In [None]:
# Coefficients of the model fitted on the training rows
lr.params

In [None]:
# Full regression summary for the training-only model
lr.summary()

In [None]:
# Out-of-sample check: predictions of the training-only model `lr` on the held-out rows
y_pred = lr.predict(X_test)

# Scatter of predicted vs. actual test values; the dashed y = x line spans the full range of y
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.5)
diagonal = [y.min(), y.max()]
ax.plot(diagonal, diagonal, color='red', linestyle='--')
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
ax.set_title('Predicted vs Actual Values')
ax.grid(True)
plt.show()

# Show the statistical summary
print(lr.summary())

In [None]:
# Predictions of the all-data model `model` on the held-out test rows.
# `model` was fitted on every row, test rows included, so this is not a true
# out-of-sample evaluation; it is shown for comparison with the training-only model `lr`.
# (matplotlib and pandas are already imported in the first cell, so they are not re-imported here.)

# Get the predicted values
y_pred = model.predict(X_test)

# Plot the predictions vs actual values, with a dashed y = x reference line
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot([y.min(), y.max()], [y.min(), y.max()], color='red', linestyle='--')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Predicted vs Actual Values')
plt.grid(True)
plt.show()

# Show the statistical summary
print(model.summary())

In [None]:
# Pair each model's test-set predictions with the true mpg values (joined on the row index).
# a_model: model fitted on all rows; b_model: model fitted on the training rows only.
y_true_frame = pd.DataFrame(y_test).rename({'mpg':'y_true'}, axis = 1)
a_model = pd.DataFrame(model.predict(X_test), columns = ['y_pred']).join(y_true_frame)
b_model = pd.DataFrame(lr.predict(X_test), columns = ['y_pred']).join(y_true_frame)

In [None]:
# Mean absolute error on the test rows for both models.
# The mean (not the sum) of the absolute errors is used so the number matches the "MAE" label
# and the MAE computed inside model_prediction.
print(f"fit all data model -> MAE : {abs(a_model['y_pred'] - a_model['y_true']).mean()}")
print(f"fit training data model -> MAE : {abs(b_model['y_pred'] - b_model['y_true']).mean()}")

# The all-data model has already seen the test rows during fitting, which is why it fits them
# better than the training-only model.

## Function

In [None]:
def _plot_predicted_vs_actual(y_true, y_pred, value_range):
    """Scatter predicted against actual values with a dashed y = x reference line over value_range."""
    plt.figure(figsize=(10, 6))
    plt.scatter(y_true, y_pred, alpha=0.5)
    plt.plot(value_range, value_range, color='red', linestyle='--')
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.title('Predicted vs Actual Values')
    plt.grid(True)
    plt.show()


def model_prediction(df, X_col, train_test = False, target = 'mpg', test_size = 20, random_state = 123):
    """Fit an OLS model of `target` on the columns `X_col` and report how well it predicts.

    A constant term is added to the predictors before fitting. The function prints the
    fitted coefficients, draws a predicted-vs-actual scatter plot, prints the statsmodels
    summary and finally prints the MSE and MAE of the predictions.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the predictor columns and the target column.
    X_col : list of str
        Names of the predictor columns.
    train_test : bool, default False
        If false, fit and evaluate on all rows. If true, split the rows with
        train_test_split, fit on the training part and evaluate on the test part.
    target : str, default 'mpg'
        Name of the response column.
    test_size : int or float, default 20
        Forwarded unchanged to train_test_split (scikit-learn treats an int as an
        absolute number of test rows and a float as a fraction of the data).
    random_state : int, default 123
        Forwarded to train_test_split so the split is repeatable.

    Returns
    -------
    The fitted statsmodels OLS results object.
    """
    X = sm.add_constant(df[X_col])
    y = df[target]

    # Decide which rows are used for fitting and which for evaluation.
    if train_test:
        X_fit, X_eval, y_fit, y_eval = train_test_split(
            X, y, test_size = test_size, random_state = random_state
        )
    else:
        X_fit, X_eval, y_fit, y_eval = X, X, y, y

    model = sm.OLS(y_fit, X_fit).fit()
    print(model.params)

    # Predict once and reuse the result for both the plot and the error metrics.
    y_pred = model.predict(X_eval)

    # The reference line always spans the full range of y, even when only test rows are plotted.
    _plot_predicted_vs_actual(y_eval, y_pred, [y.min(), y.max()])

    # Show the statistical summary
    print(model.summary())
    if train_test:
        print('------------------------------------------------')

    # y_pred and y_eval share the same row index, so the subtraction is row by row.
    residuals = y_pred - y_eval
    print(f"MSE : {(residuals**2).mean()} ")
    print(f"MAE : {residuals.abs().mean()} ")

    return model

In [None]:
# List the available column names (candidates for X_col in model_prediction)
df.columns

In [None]:
# Fit and evaluate on the full dataset with all seven predictors (no train/test split)
model_prediction(df,
                ['cylinders', 'displacement', 'horsepower', 'weight','acceleration', 'model year', 'origin'],
                 train_test = False)

In [None]:
# Fit on a training split and evaluate on the held-out rows; note that 'cylinders' is left out of the predictors here
model_prediction(df,['displacement','horsepower','weight','acceleration','model year','origin'], train_test = True)