In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read possum.csv from the Kaggle input mount and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="/kaggle/input/openintro-possum/possum.csv")
df.head(5)

In [None]:
# Column dtypes and non-null counts, to spot columns with missing values.
df.info()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Annotated heatmap of pairwise correlations between the numeric columns.
# `Pop` and `sex` still hold strings at this point, and DataFrame.corr()
# raises on non-numeric columns from pandas 2.0 onward, so the frame is
# restricted to numeric dtypes before correlating.
plt.figure(figsize=(10,9))
sns.heatmap(df.select_dtypes(include="number").corr(),annot=True)

In [None]:
# Head length against tail length, coloured by sex; fit_reg=False leaves out
# the regression lines so only the points are drawn.
sns.lmplot(
    data=df,
    x="hdlngth",
    y="taill",
    hue="sex",
    fit_reg=False,
)

In [None]:
# Per-sex kernel density estimates for four body measurements, one panel each.
# The (column, title) pairs drive a single loop instead of four copy-pasted
# stanzas, and `fill=True` replaces the deprecated `shade=True` keyword.
kde_panels = [
    ("hdlngth", "Head length distributions by sex"),
    ("taill", "Tail distributions by sex"),
    ("footlgth", "Foot length distributions by sex"),
    ("chest", "Chest size distributions by sex"),
]

plt.figure(figsize=(14,4))
for panel_no, (column, title) in enumerate(kde_panels, start=1):
    plt.subplot(1, len(kde_panels), panel_no)
    # Females first, then males, so colours and legend order match across panels
    for sex_code in ("f", "m"):
        sns.kdeplot(x=column, data=df[df["sex"] == sex_code], fill=True, label=sex_code)
    plt.legend()
    plt.title(title)
plt.tight_layout()

In [None]:
# Two scatter-plus-fit panels sharing head length on the x axis.
plt.figure(figsize=(12, 4))

# Left panel: skull width
plt.subplot(1, 2, 1)
sns.regplot(data=df, x="hdlngth", y="skullw", line_kws=dict(color="red"))
plt.xlabel("Head Length")
plt.ylabel("Skull Width")

# Right panel: chest size
plt.subplot(1, 2, 2)
sns.regplot(data=df, x="hdlngth", y="chest", line_kws=dict(color="red"))
plt.xlabel("Head Length")
plt.ylabel("Chest Size")

In [None]:
# Map of missing cells: one heatmap row per record, highlighted wherever a
# value is null. Row tick labels and the colour bar are switched off.
sns.heatmap(
    df.isna(),
    cmap="viridis",
    cbar=False,
    yticklabels=False,
)

In [None]:
# Another look at the first rows before encoding the string columns.
df.head()

In [None]:
# Distinct labels in the two string columns, to decide how to encode them.
pop_levels = df["Pop"].unique()
sex_levels = df["sex"].unique()
print("Pop unique values: {}".format(pop_levels))
print("Sex unique values: {}".format(sex_levels))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Preprocessing Data
def prepro(df, test_size=0.3, random_state=None):
    """Encode, split, impute and scale the data for regression on ``totlngth``.

    The train/test split is made *before* any statistic is estimated, so the
    imputation means and the scaler parameters come from the training rows
    only and the held-out rows cannot leak into preprocessing.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing at least ``sex`` (labels "m"/"f"), ``Pop``
        (labels "Vic"/"other") and the target column ``totlngth``. The
        remaining columns are assumed to be numeric. The caller's frame is
        not modified.
    test_size : float, default 0.3
        Forwarded to ``train_test_split``.
    random_state : int or None, default None
        Forwarded to ``train_test_split``. ``None`` leaves the shuffle
        unseeded (the previous behaviour); an integer makes the split
        repeatable.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]``. The feature frames are
        standardised with the training-set mean/std and keep their original
        row index and column names.
    """
    df = df.copy()

    # Replace str values with binary values. `map` yields a numeric column
    # directly; a label that is not in the mapping becomes NaN and is then
    # imputed like any other missing value.
    df["sex"] = df["sex"].map({"m": 1, "f": 0})
    df["Pop"] = df["Pop"].map({"Vic": 1, "other": 0})

    X = df.drop("totlngth", axis=1)
    y = df["totlngth"]

    # Split first so nothing computed below ever sees the test rows
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, shuffle=True, random_state=random_state
    )

    # Fill null values with the *training* column means. Whole-frame fillna
    # returns new objects, avoiding chained in-place assignment, which is a
    # no-op under pandas Copy-on-Write.
    feature_means = X_train.mean()
    X_train = X_train.fillna(feature_means)
    X_test = X_test.fillna(feature_means)

    # The target was mean-filled before as well; keep that, using the training mean
    target_mean = y_train.mean()
    y_train = y_train.fillna(target_mean)
    y_test = y_test.fillna(target_mean)

    # Fit the scaler on the training features only, then apply it to both parts
    scaler = StandardScaler().fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return [X_train, X_test, y_train, y_test]
    
    
# Initial train/test split, used by the first round of model fits.
X_train,X_test,y_train,y_test = prepro(df)
    
    

In [None]:
# Importing regression models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Both regressors use library defaults.
rfmod, linmod = RandomForestRegressor(), LinearRegression()

# Train both on the same split: forest first, then the linear model.
rfmod.fit(X=X_train, y=y_train)
linmod.fit(X=X_train, y=y_train)

In [None]:
# Visually comparing model performance
# Held-out predictions from each fitted model
p = linmod.predict(X_test)
preds = rfmod.predict(X_test)

plt.figure(figsize=(15, 5))

# Left panel: linear regression
plt.subplot(1, 2, 1)
plt.scatter(x=y_test, y=p)
plt.title("Linear Regression Model")
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")

# Right panel: random forest
plt.subplot(1, 2, 2)
plt.scatter(x=y_test, y=preds)
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.title("Random Forest Regressor Model")

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# R^2 of each model on the held-out split
rf_r2 = r2_score(y_true=y_test, y_pred=preds)
lin_r2 = r2_score(y_true=y_test, y_pred=p)
print("Random Forest Regressor Score: {}".format(rf_r2))
print("Linear Regression Score: {}".format(lin_r2))

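Running each model several times gives a fairly wide range of R^2 scores, because every call to `prepro` draws a new random train/test split (and the random forest is itself randomised). A fairer comparison is to collect the scores over a number of independent splits and fits.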

In [None]:
# Repeat split -> fit -> score twenty times, collecting the held-out R^2 of
# the linear model in `lr2` and of the random forest in `rfr2`.
lr2, rfr2 = [], []
for split_no in range(1, 21):
    # Fresh random split on every pass
    X_train, X_test, y_train, y_test = prepro(df)

    # fit() returns the estimator itself, so construction and training chain
    linmod = LinearRegression().fit(X_train, y_train)
    rfmod = RandomForestRegressor().fit(X_train, y_train)

    lr2.append(linmod.score(X_test, y_test))
    rfr2.append(rfmod.score(X_test, y_test))

In [None]:
# R^2 per split for both models on one set of axes
split_numbers = range(1, 21)

plt.figure(figsize=(12, 5))
plt.plot(split_numbers, lr2, label="Linear Model", marker="o", ls="--")
plt.plot(split_numbers, rfr2, label="Random Forest Model", marker="o")
plt.xlabel("Split, Fit No.")
plt.ylabel("R^2 Score")
plt.legend(loc="lower right")

In [None]:
# Average R^2 over all collected splits, to three decimals
print("Linear Model Mean R^2 Score: {:.3f}".format(np.mean(lr2)))
print("Random Forest Model Mean R^2 Score: {:.3f}".format(np.mean(rfr2)))

Across these 20 splits, the linear model seems to consistently outperform the random forest model on R^2.

In [None]:
# NOTE: `linmod` and `X_test` here are whatever the last iteration of the
# repeated-split loop left behind, not the models from the first fit.
predictions = linmod.predict(X_test)

In [None]:
# Residuals: actual minus predicted, binned into eight bars
residuals = y_test - predictions
sns.histplot(data=residuals, bins=8)

In [None]:
# Mean absolute error of the linear model's predictions on the held-out rows
mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print("Mean Absolute Error: {:.3f}".format(mae))