In [None]:
# Import packages
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn import linear_model
from sklearn import model_selection
from sklearn import metrics

In [None]:
# Load seaborn's example 'mpg' dataset and preview its first five rows
mpg = sns.load_dataset(name='mpg')
mpg.head(n=5)

In [None]:
# Descriptive statistics
# Summarise `mpg` using describe() with its default settings.
mpg.describe()

In [None]:
# One-way frequency table: number of cars per origin
pd.crosstab(mpg['origin'], 'count')

In [None]:
# Two-way frequency table: origin (rows) by number of cylinders (columns)
pd.crosstab(mpg['origin'], mpg['cylinders'])

In [None]:
# Descriptive statistics of mpg by origin.
# Select the 'mpg' column before calling describe() so only that column is
# summarised, rather than describing every numeric column per group and then
# discarding all but one of them. The resulting table is the same.
mpg.groupby('origin')['mpg'].describe()

In [None]:
# Distribution of mpg across all cars, written to disk as a 300-dpi PNG
sns.displot(mpg, x='mpg')
plt.savefig(fname='aina_1301.png', dpi=300)

In [None]:
# Box plot comparing mpg across origins; the figure is saved at 300 dpi.
# The returned Axes is kept so the save goes through its own figure explicitly.
ax = sns.boxplot(data=mpg, x='origin', y='mpg', color='pink')
ax.figure.savefig('aina_1302.png', dpi=300)

In [None]:
# Faceted histograms of mpg, one panel per origin, saved at 300 dpi
sns.displot(mpg, x='mpg', col='origin')
plt.savefig(fname='aina_1303.png', dpi=300)

In [None]:
# Split the cars by origin for the two-sample comparison.
# The seaborn 'mpg' dataset stores origin labels in lowercase ('usa', 'europe',
# 'japan'), so comparing against 'USA' / 'Europe' matches no rows and leaves
# both frames empty. Lower-casing before comparing makes the filter
# case-insensitive.
usa_cars = mpg[mpg['origin'].str.lower() == 'usa']
europe_cars = mpg[mpg['origin'].str.lower() == 'europe']

# Fail fast if a filter matched nothing: a t-test on an empty sample yields NaN.
assert not usa_cars.empty, "no rows with origin 'usa'"
assert not europe_cars.empty, "no rows with origin 'europe'"

In [None]:
# Independent two-sample t-test on mpg: USA cars versus European cars
stats.ttest_ind(a=usa_cars['mpg'], b=europe_cars['mpg'])

In [None]:
# Pairwise correlation matrix for mpg, horsepower and weight
mpg.loc[:, ['mpg', 'horsepower', 'weight']].corr()

In [None]:
# Scatter plot of weight (x) against mpg (y), with a title on the axes
ax = sns.scatterplot(data=mpg, x='weight', y='mpg')
ax.set_title('Relationship between weight and mileage')

In [None]:
# Grid of pairwise plots for mpg, horsepower and weight
sns.pairplot(data=mpg[['mpg', 'horsepower', 'weight']])

In [None]:
# Simple linear regression of mpg (response, y) on weight (predictor, x)
stats.linregress(mpg['weight'], mpg['mpg'])

In [None]:
# Scatter plot with a fitted regression line; label the axes, add a title,
# and save the figure at 300 dpi.
ax = sns.regplot(data=mpg, x='weight', y='mpg')
ax.set(
    xlabel='Weight (lbs)',
    ylabel='Mileage (mpg)',
    title='Relationship between weight and mileage',
)
ax.figure.savefig('aina_1306.png', dpi=300)

In [None]:
# Split weight (feature) and mpg (target) into training and test sets.
# random_state is fixed so the split is the same on every run; the double
# brackets select each column as a one-column DataFrame rather than a Series.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    mpg[['weight']],
    mpg[['mpg']],
    random_state=1234,
)

In [None]:
# Dimensions of the training target produced by the train/test split
y_train.shape

In [None]:
# Dimensions of the held-out test target produced by the train/test split
y_test.shape

In [None]:
# Build a linear regression model and fit it on the training split.
# fit() returns the estimator itself, so construction and fitting are chained.
regr = linear_model.LinearRegression().fit(X_train, y_train)

# Predict the target for the test split
y_pred = regr.predict(X_test)

# Display the first five predictions
y_pred[:5]

In [None]:
# Get the training model coefficients
# (coef_ holds the fitted slope term(s); the intercept is a separate attribute, intercept_)
regr.coef_

In [None]:
# Coefficient of determination (R^2) of the predictions on the test split
metrics.r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Mean squared error of the predictions on the test split
metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)