In [None]:
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error
import joblib
%matplotlib inline

In [None]:
# Load the meat-consumption CSV (the path is relative to the notebook's working directory).
df = pd.read_csv('../input/meatconsumption/meat_consumption_worldwide.csv')
# Cell output: (number of rows, number of columns).
df.shape

In [None]:
# Count missing values in each column.
df.isna().sum()

In [None]:
# Preview the first rows to see the column layout.
df.head()

In [None]:
def plot_barh(y, x, title=None, xlabel=None, ylabel=None, figsize=(10, 10), style='fivethirtyeight', color='blue', write_num=False):
    """Draw a horizontal bar chart and show it.

    Parameters
    ----------
    y : sequence
        Values passed to ``barh`` as the y coordinates (one bar each).
    x : sequence
        Bar lengths, one per entry of ``y``.
    title, xlabel, ylabel : str, optional
        Figure title and axis labels.
    figsize : tuple
        Figure size handed to ``plt.subplots``.
    style : str
        Matplotlib style name. It is applied with ``plt.style.use``, so it
        remains active for figures created after this call as well.
    color : str
        Bar colour.
    write_num : bool
        When True, write each value of ``x`` as text next to its bar.
    """
    plt.style.use(style)
    fig, ax = plt.subplots(figsize=figsize)

    ax.barh(y, x, height=0.75, color=color)
    # Flip the y axis so the first entry is drawn at the top.
    ax.invert_yaxis()
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

    if write_num:
        # Text goes at x = the bar's value, y = the bar's ordinal index + 0.25.
        for position, value in enumerate(x):
            ax.text(value, position + 0.25, str(value))
    plt.show()


## Countries 

In [None]:
# Number of rows per country code (value_counts orders them most frequent first).
location_count = df['LOCATION'].value_counts()
country_code = location_count.keys()
count = location_count.values

plot_barh(
    country_code,
    count,
    title="Country and its Frequency",
    xlabel='Frequency',
    ylabel='Country Code',
    figsize=(18, 18),
    write_num=True,
)

## Types of meat consumption

In [None]:
# Distinct values found in the SUBJECT column.
df['SUBJECT'].unique()

In [None]:
# Share of rows belonging to each SUBJECT value, drawn as a pie chart.
subject_counts = df['SUBJECT'].value_counts()
sub_keys = subject_counts.keys()

plt.figure(figsize=(10,10))
subject_counts.plot(
    kind='pie',
    labels=sub_keys,
    autopct="%.1f",
    shadow=True,
    title='Types of meat consumption',
)
plt.show()

## Country and its meat consumption

In [None]:
for country in country_code:
    # SUBJECT entries of the rows that belong to this country.
    subject_of_this_country = df.loc[df['LOCATION'] == country, 'SUBJECT']
    # Count once and reuse for labels, total and the plot itself.
    meat_counts = subject_of_this_country.value_counts()

    meats = meat_counts.keys()
    total = meat_counts.sum()

    meat_counts.plot(
        kind='pie',
        labels=meats,
        autopct="%.1f",
        shadow=True,
        title=f'Meat consumption of {country} ({total})',
    )
    plt.show()
    print('\n')

## Total Value of each year 

In [None]:
# Distinct TIME values in ascending order; used as the x axis of the yearly totals.
years = sorted(df['TIME'].unique())

In [None]:
# Sum Value per year in one grouped pass instead of re-filtering the whole
# frame once for every year.
value_by_year = df.groupby('TIME')['Value'].sum()
# Keep the list ordered like `years`, since the next cell pairs the two up;
# a year with no group falls back to 0, matching the sum of an empty selection.
total_VALUE_per_year = value_by_year.reindex(years, fill_value=0).tolist()

In [None]:
# Pair every year with its summed Value so the trend can be tabulated and plotted.
year_and_total_value = pd.DataFrame({'Year':years,'total_value':total_VALUE_per_year})

In [None]:
# Display the per-year totals table.
year_and_total_value

In [None]:
# Line chart of the summed Value for each year.
fig, ax = plt.subplots(figsize=(10,10))
sns.lineplot(
    data=year_and_total_value,
    x='Year',
    y='total_value',
    linewidth=1,
    color='r',
    ax=ax,
)
plt.show()

## Measurements Used

In [None]:
# Distinct values found in the MEASURE column.
df['MEASURE'].unique()

In [None]:
# Number of rows recorded under each MEASURE value.
df['MEASURE'].value_counts()

In [None]:
# Pie chart of how the rows are split between the MEASURE values.
plt.figure(figsize=(10,10))
measure_counts = df['MEASURE'].value_counts()
measure_keys = measure_counts.keys()
measure_counts.plot(kind='pie', labels=measure_keys, autopct="%.1f")
plt.show()

In [None]:
# Histogram of the Value column; rwidth < 1 leaves a gap between the bars.
plt.figure(figsize=(10,10))
df['Value'].plot.hist(rwidth=0.95)
plt.show()

## One Hot encoding

In [None]:
# One-hot encode the first three columns; drop_first=True omits the first level
# of each encoded column.
# NOTE(review): the columns are picked by position — presumably LOCATION,
# SUBJECT and MEASURE; confirm against the CSV's column order.
cols_to_encode = df.columns[:3]
new_df = pd.get_dummies(df[cols_to_encode],drop_first=True)

In [None]:
# Preview the one-hot encoded columns.
new_df.head()

## Scaling the year column 

In [None]:
# Fit a min-max scaler on the TIME column (default output range is [0, 1]).
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed later in the notebook.
scaler = MinMaxScaler()
scaler.fit(df[['TIME']])

In [None]:
# The scaler was already fitted on this same column in the previous cell, so
# only transform here instead of fitting a second time. flatten() turns the
# single-column 2-D result into a 1-D array for column assignment.
scalled_years = scaler.transform(df[['TIME']]).flatten()

In [None]:
# Spot-check the first five scaled TIME values.
scalled_years[:5]

In [None]:
# Append the scaled TIME feature and the untouched target column to the
# encoded frame (TIME first, then Value).
new_df = new_df.assign(TIME=scalled_years, Value=df['Value'])

In [None]:
# Shape of the frame after adding the TIME and Value columns.
new_df.shape

In [None]:
# Preview the frame with the encoded columns, scaled TIME and the Value target.
new_df.head()

## Splitting the data 

In [None]:
# Target is the Value column; the features are every other column.
y = new_df['Value']
x = new_df.drop(columns='Value')

In [None]:
# Confirm that features and target have the same number of rows.
x.shape, y.shape

In [None]:
# Hold out 30% of the rows for testing. A fixed random_state makes the split,
# and therefore every score reported in later cells, reproducible across
# kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=42)

In [None]:
# Row/column counts of the train and test feature sets.
x_train.shape, x_test.shape

In [None]:
# Row counts of the train and test targets.
y_train.shape, y_test.shape

## Model Building and Predictions

In [None]:
# Baseline linear models, all with their default hyper-parameters.
models = [LinearRegression(), Lasso(), Ridge()]

for model in models:
    print("Model:", model)
    # fit() returns the estimator itself, so the alias points at the same object.
    this_model = model.fit(x_train, y_train)
    # For regressors, score() reports R^2 on the given data.
    r2_on_test = this_model.score(x_test, y_test)
    print("Score:", r2_on_test)

## Using KNeighborsRegressor 

In [None]:
# Fit a k-nearest-neighbours regressor with default settings, then show its
# R^2 score on the test split as the cell output.
kn_model = KNeighborsRegressor().fit(x_train, y_train)
kn_model.score(x_test, y_test)

In [None]:
# R^2 on the training split, for comparison with the test score.
kn_model.score(x_train, y_train)

In [None]:
# Predictions of the fitted KNN model on both splits; later cells compare them
# with the actual values.
y_pred_test = kn_model.predict(x_test)
y_pred_train = kn_model.predict(x_train)

In [None]:
# Actual and predicted values side by side, one frame per split.
test = pd.DataFrame(data={'Y test': y_test, 'Y predicted test': y_pred_test})

train = pd.DataFrame(data={'Y train': y_train, 'Y predicted train': y_pred_train})

In [None]:
# Random sample of 7 test rows: actual vs. predicted.
test.sample(7)

In [None]:
# Random sample of 7 training rows: actual vs. predicted.
train.sample(7)

In [None]:
# Same call as the previous cell; it draws another random sample of 7 training rows.
train.sample(7)

In [None]:
# Correlation between actual and predicted values on the test split.
test.corr()

In [None]:
# Correlation between actual and predicted values on the training split.
train.corr()

In [None]:
# Mean absolute error of the KNN predictions on the test split.
mean_absolute_error(y_test,y_pred_test)

In [None]:
# Mean absolute error of the KNN predictions on the training split.
mean_absolute_error(y_train,y_pred_train)

# **Compare actual and predicted**

In [None]:
# Order each comparison frame by its actual-value column.
test = test.sort_values(by=['Y test'])
train = train.sort_values(by=['Y train'])

In [None]:
def compare(df, title=None):
    """Scatter-plot the second column of ``df`` against the first and show it.

    ``df`` must have exactly two columns (the unpacking fails otherwise). The
    first column is drawn on the x axis, labelled 'Actual'; the second on the
    y axis, labelled 'Predicted'. ``title`` is used as the plot title.
    """
    actual_col, predicted_col = df.columns
    fig, ax = plt.subplots(figsize=(10,10))
    ax.scatter(df[actual_col], df[predicted_col], color='b')
    ax.set_title(title)
    ax.set_xlabel('Actual')
    ax.set_ylabel('Predicted')
    plt.show()

In [None]:
# Actual vs. predicted scatter for the test split (title spelling corrected).
compare(test, 'Test Comparison')

In [None]:
# Actual vs. predicted scatter for the training split (title spelling corrected).
compare(train, 'Train Comparison')

# **Saving model as file**

In [None]:
# Persist the fitted KNN model to the working directory.
# NOTE(review): the file name is spelled 'comsumption' and has no extension;
# it is left unchanged here because anything that loads the model must use
# this exact name.
joblib.dump(kn_model, 'Meat-comsumption-Model')

In [None]:
# List the working directory, e.g. to check that the model file was written.
!ls