In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the avocado price table from the Kaggle input directory and preview its first rows.
avocado_data = pd.read_csv(
    filepath_or_buffer="/kaggle/input/avocado-prices/avocado.csv",
)
avocado_data.head(n=5)

# Dropping the "Unnamed: 0" and "Date" columns

In [None]:
# Remove the "Unnamed: 0" and "Date" columns from the table.
avocado_data = avocado_data.drop(columns=["Unnamed: 0", "Date"])

In [None]:
# Give the numerically named columns descriptive size names, then preview the result.
avocado_data = avocado_data.rename(
    {"4046": "small", "4225": "big", "4770": "very big"},
    axis="columns",
)
avocado_data.head()

# Handling missing data

In [None]:
# Treat every exact 0.0 entry as a missing value, then report the number of
# missing entries per column alongside the total row count.
avocado_data = avocado_data.replace(to_replace=0.0, value=np.nan)
missing_values_count = avocado_data.isna().sum()
print("Length of data: ", avocado_data.shape[0])
missing_values_count

In [None]:
# Preview the first five rows after the zero -> NaN replacement.
avocado_data.head()

In [None]:
# Columns whose missing entries are going to be filled in.
columns = [
    'small', 'big', 'very big',
    'Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags',
]

# Median of each listed column, stored in the same order as `columns`.
medians = [avocado_data[name].median() for name in columns]

medians

In [None]:
# Fill the gaps of every listed column with the median computed for that column;
# `columns` and `medians` are walked in lockstep.
for c, column_median in zip(columns, medians):
    avocado_data[c] = avocado_data[c].fillna(column_median)

avocado_data.head()

# Filled all NaN values with the median of their column

In [None]:
# Verification only: count the missing values that remain after the median fill.
# The 0.0 -> NaN replacement is intentionally NOT repeated here, so this check
# can never modify the imputed table or re-introduce missing values.
missing_values_count = avocado_data.isnull().sum()
print("Length of data: ", len(avocado_data))
missing_values_count

# Plotting average price

In [None]:
import seaborn as sns

# `sns.distplot` is deprecated in seaborn; `histplot` with a KDE overlay and
# density normalisation is its documented replacement for the same figure.
# The trailing `;` suppresses the Axes repr in the cell output.
sns.histplot(avocado_data['AveragePrice'], kde=True, stat='density');

In [None]:
import matplotlib.pyplot as plt

# Correlate only the numeric columns: `type` and `region` are label-encoded
# later in the notebook, and DataFrame.corr() raises on non-numeric columns in
# pandas >= 2.0 (older versions silently skipped them, so the matrix is the same).
numeric_corr = avocado_data.select_dtypes(include='number').corr()

f, ax = plt.subplots(figsize=(10, 9))
sns.heatmap(
    numeric_corr,
    annot=True, fmt='.2f', ax=ax,
    vmin=-1, vmax=1, center=0,  # symmetric colour scale centred on "no correlation"
    cmap='coolwarm', linewidths=3, linecolor='black',
)
ax.set_title('Correlation between numeric columns')
plt.show()

In [None]:
import plotly.express as px

# Scatter of total volume against average price, one colour per avocado type.
fig = px.scatter(
    avocado_data,
    x='AveragePrice',
    y='Total Volume',
    color='type',
)
fig.update_layout(
    title='Average Price Vs Volume with Avocado Type ',
    xaxis_title="Price",
    yaxis_title="Volume",
)
fig.show()

# Preprocessing data

In [None]:
from sklearn.preprocessing import LabelEncoder

# Categorical columns that are converted to integer codes.
label_cols = ['type', 'region']

label = LabelEncoder()
for col in label_cols:
    # The single shared encoder is re-fit on each column in turn.
    avocado_data[col] = label.fit_transform(avocado_data[col])

# Splitting data into train and test

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is the average price.
X = avocado_data.drop(['AveragePrice'], axis=1)
y = avocado_data["AveragePrice"]

# Hold out 20% of the rows for testing. The fixed random_state makes the split
# identical on every run, so results survive a kernel restart and re-run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

In [None]:
from sklearn.compose import make_column_transformer 
from sklearn.preprocessing import StandardScaler, OneHotEncoder

scaler = StandardScaler()
# handle_unknown='ignore': a category that occurs only in the test rows is encoded
# as all zeros instead of raising when the test set is transformed.
ohe = OneHotEncoder(handle_unknown='ignore')

# Columns to standardise: everything except the target, the two categoricals and year.
scale_cols = avocado_data.drop(['AveragePrice','type','year','region'], axis=1).columns
# Columns to one-hot encode.
col_trans = avocado_data[label_cols].columns
# NOTE(review): `year` is in neither list, so it never reaches the model — confirm this is intended.

# NOTE: X_train / X_test are overwritten with NumPy arrays below, so re-run the
# train/test split cell before re-running this one.

# Training set: learn the scaling statistics and the category vocabulary here only.
scaled_columns = scaler.fit_transform(X_train[scale_cols])
encoded_columns = ohe.fit_transform(X_train[col_trans])
X_train = np.concatenate([scaled_columns, encoded_columns.toarray()], axis=1)

# Test set: reuse the transformers fitted on the training data. Re-fitting on the
# test rows would scale them with different statistics and could produce a different
# number of one-hot columns than the model was trained on.
scaled_columns = scaler.transform(X_test[scale_cols])
encoded_columns = ohe.transform(X_test[col_trans])
X_test = np.concatenate([scaled_columns, encoded_columns.toarray()], axis=1)

# The model

In [None]:
import tensorflow as tf

# Fully connected regressor: two ReLU hidden layers (32 and 16 units), each followed
# by batch normalisation and 20% dropout, ending in a single linear output unit.
avocado_model = tf.keras.Sequential()
avocado_model.add(
    tf.keras.layers.Dense(32, activation='relu', input_shape=X_train.shape[1:])
)
avocado_model.add(tf.keras.layers.BatchNormalization())
avocado_model.add(tf.keras.layers.Dropout(.2))
avocado_model.add(tf.keras.layers.Dense(16, activation='relu'))
avocado_model.add(tf.keras.layers.BatchNormalization())
avocado_model.add(tf.keras.layers.Dropout(.2))
avocado_model.add(tf.keras.layers.Dense(1))

# Mean squared error loss, optimised with stochastic gradient descent.
avocado_model.compile(optimizer='sgd', loss='mse')
avocado_model.summary()

In [None]:
num_epochs = 30

# Stop training once the validation loss has not improved for `patience` epochs
# and roll the model back to its best weights.
# NOTE(review): patience=25 with 30 epochs means stopping can only trigger in the
# last few epochs — consider a smaller patience or more epochs.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=25, restore_best_weights=True) 

history = avocado_model.fit(
    X_train, y_train,
    validation_split=0.15,  # 15% of the training rows are held out to compute val_loss
    epochs=num_epochs,
    callbacks=[callback],  # the callback was previously created but never passed to fit()
)

# Plotting results

In [None]:
import matplotlib.pyplot as plt 

def plot_graphs(history, string):
    """Plot a training metric and its validation counterpart per epoch.

    Args:
        history: object whose ``history`` attribute maps metric names to
            per-epoch value sequences (e.g. the result of ``avocado_model.fit``).
        string: name of the metric to plot; ``'val_' + string`` must also be
            a key of ``history.history``.
    """
    val_key = 'val_' + string
    # Training curve first, validation curve second, so the legend order matches.
    for series_name in (string, val_key):
        plt.plot(history.history[series_name])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, val_key])
    plt.show()
   
# Draw the training and validation loss curves recorded in `history`.
plot_graphs(history, "loss")  

In [None]:
print("Evaluate on test data")
# The model is compiled with loss='mse' and no metrics, so evaluate() returns
# only the loss — there is no accuracy value to report for this regression.
results = avocado_model.evaluate(X_test, y_test)
print("test loss (MSE):", results)

# Predictions

In [None]:
# Predict the first five test rows and print them next to the matching true prices.
prediction = avocado_model.predict(X_test[0:5])
print("prediction: ", prediction.tolist())
print("test: ", y_test.iloc[:5].tolist())