In [None]:
#### First we import all the libraries that we are going to use
# libraries for preprocessing and linear algebra
import numpy as np
import pandas as pd 
import os
# libraries for deep learning: we use fastai as a framework on top of PyTorch
import torch
from fastai.vision import *
from fastai.metrics import error_rate
#visualization library
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

In [None]:
#### Phase 1: data import, exploratory data analysis and pre-processing
# Read the training table from train.csv (relative to the working directory)
# and preview its first rows.
traindf = pd.read_csv("train.csv")
traindf.head()

In [None]:
# Inspect the training table: column names, dtypes and non-null counts.
traindf.info()

In [None]:
# Basic summary statistics of the training table.
traindf.describe()

In [None]:
# Exploring the data: for every row, count how many of the four label
# columns are set.
label_columns = ["healthy", "multiple_diseases", "rust", "scab"]
# skipna=False propagates a missing value as NaN, exactly like chained `+`.
classdata = traindf[label_columns].sum(axis=1, skipna=False)
classdata.head()

In [None]:
# Check that every row carries exactly one of the four labels.
# Testing `classdata > 1` alone would miss rows that have no label at all,
# so compare against 1 directly.
# ----> True means this is a single-label (multi-class) problem rather than
# a multi-label one: each example falls under exactly one of the 4 classes.
(classdata == 1).all()

In [None]:
# Append ".jpg" to each image id so it can be used as a file name when the
# images are loaded later on.
# Ids that already end in ".jpg" are left alone, so re-running this cell does
# not produce "name.jpg.jpg".
image_ids = traindf["image_id"].astype("str")
traindf["image_id"] = image_ids.where(image_ids.str.endswith(".jpg"),
                                      image_ids + ".jpg")
traindf.head()

In [None]:
# Collapse the four one-hot columns into a single integer class label:
# 0 for healthy
# 1 multiple_diseases
# 2 rust
# 3 scab
class_columns = ["healthy", "multiple_diseases", "rust", "scab"]
# Guard so the cell can be re-run: once the one-hot columns have been dropped
# they can no longer be read to rebuild the label.
if "label" not in traindf.columns:
    traindf["label"] = (0*traindf.healthy + 1*traindf.multiple_diseases +
                        2*traindf.rust + 3*traindf.scab)
    # Rebind instead of dropping in place; the remaining columns keep their order.
    traindf = traindf.drop(columns=class_columns)
traindf.head()

In [None]:
## Visual EDA to understand the data better
# Class imbalance: pie chart of the per-class totals, computed from a fresh
# copy of train.csv (every column after the first is summed).
train_data = pd.read_csv("train.csv")
class_totals = train_data.iloc[:, 1:].sum()
targets_pie = go.Pie(labels=train_data.columns[1:], values=class_totals.values)
fig = go.Figure([targets_pie])
fig.update_layout(title_text="Pie chart of targets", template="simple_white")
# Thin black outline around each slice of the (single) pie trace.
pie_outline = fig.data[0].marker.line
pie_outline.color = 'rgb(0, 0, 0)'
pie_outline.width = 0.5
fig.show()

In [None]:
# Distribution of the healthy class.
# Convert the 0/1 flag to "False"/"True" strings so plotly treats it as a category.
train_data["Healthy"] = train_data["healthy"].astype(bool).astype(str)
fig = px.histogram(
    train_data,
    x="Healthy",
    color="Healthy",
    title="Healthy distribution",
    color_discrete_map={
        "True": px.colors.qualitative.Plotly[0],
        "False": px.colors.qualitative.Plotly[1],
    },
)
fig.update_layout(template="simple_white")
# Outline every trace; indexing fig.data[1] directly would raise IndexError
# if the column happened to contain a single value only.
fig.update_traces(marker=dict(line=dict(color='rgb(0, 0, 0)', width=0.5)))
fig



In [None]:
# Distribution of the scab class.
# Convert the 0/1 flag to "False"/"True" strings so plotly treats it as a category.
train_data["Scab"] = train_data["scab"].astype(bool).astype(str)
fig = px.histogram(
    train_data,
    x="Scab",
    color="Scab",
    title="Scab distribution",
    color_discrete_map={
        "True": px.colors.qualitative.Plotly[1],
        "False": px.colors.qualitative.Plotly[0],
    },
)
fig.update_layout(template="simple_white")
# Outline every trace; indexing fig.data[1] directly would raise IndexError
# if the column happened to contain a single value only.
fig.update_traces(marker=dict(line=dict(color='rgb(0, 0, 0)', width=0.5)))
fig

In [None]:
# Distribution of the rust class.
# Convert the 0/1 flag to "False"/"True" strings so plotly treats it as a category.
train_data["Rust"] = train_data["rust"].astype(bool).astype(str)
fig = px.histogram(
    train_data,
    x="Rust",
    color="Rust",
    title="Rust distribution",
    color_discrete_map={
        "True": px.colors.qualitative.Plotly[1],
        "False": px.colors.qualitative.Plotly[0],
    },
)
fig.update_layout(template="simple_white")
# Outline every trace; indexing fig.data[1] directly would raise IndexError
# if the column happened to contain a single value only.
fig.update_traces(marker=dict(line=dict(color='rgb(0, 0, 0)', width=0.5)))
fig

In [None]:
# Distribution of the multiple-diseases class.
# Convert the 0/1 flag to "False"/"True" strings so plotly treats it as a category.
train_data["Multiple diseases"] = train_data["multiple_diseases"].astype(bool).astype(str)
fig = px.histogram(
    train_data,
    x="Multiple diseases",
    color="Multiple diseases",
    title="Multiple diseases distribution",
    color_discrete_map={
        "True": px.colors.qualitative.Plotly[1],
        "False": px.colors.qualitative.Plotly[0],
    },
)
fig.update_layout(template="simple_white")
# Outline every trace; indexing fig.data[1] directly would raise IndexError
# if the column happened to contain a single value only.
fig.update_traces(marker=dict(line=dict(color='rgb(0, 0, 0)', width=0.5)))
fig

In [None]:
# Build the augmentation transforms. Augmenting the data with flips,
# rotations, small zooms and lighting changes gives the model more varied
# examples, which usually improves results.
augmentation_settings = {
    "do_flip": True,
    "flip_vert": True,
    "max_rotate": 15,
    "max_zoom": 1.05,
    "max_lighting": 0.1,
    "max_warp": 0.,       # warping disabled
    "p_affine": 0.75,
    "p_lighting": 0.75,
}
transformations = get_transforms(**augmentation_settings)

In [None]:
# Wrap the labelled images in a fastai ImageDataBunch; this is the container
# a fastai learner needs in order to train on the data.
pathofdata = "/input/"
databunch_settings = dict(
    # where the files live and which dataframe columns describe them
    folder="images",
    fn_col=0,
    label_col=1,
    suffix='',
    label_delim=None,
    # train/validation split
    valid_pct=0.2,
    seed=100,
    # augmentation, image size and batch sizes
    ds_tfms=transformations,
    size=512,
    bs=64,
    val_bs=32,
)
data = ImageDataBunch.from_df(path=pathofdata, df=traindf, **databunch_settings)

In [None]:
# Display a sample of images together with their corresponding classes.
data.show_batch(rows=3, figsize=(10,7))

In [None]:
# Normalising is a necessary pre-processing step that helps the model
# generalise: the mean is subtracted from the pixel values and the result is
# divided by the standard deviation (not the variance).
# The learner is built from ImageNet-pretrained weights (pretrained=True), so
# use the ImageNet statistics the backbone was trained with instead of
# statistics estimated from a single batch of this dataset.
# fastai refuses to normalise the same DataBunch twice, so skip on re-run.
if not getattr(data, "norm", False):
    data = data.normalize(imagenet_stats)

In [None]:
#### PHASE 2: MODELING AND TRAINING
# The model is a CNN, specifically a ResNet-34, a common architecture for
# computer-vision tasks, created with pretrained weights.
training_metrics = [error_rate, accuracy]
learner = cnn_learner(data, models.resnet34, pretrained=True, metrics=training_metrics)
# Switch the learner to half-precision (fp16) training.
learner = learner.to_fp16()
# Directory the learner uses for saved model files.
learner.model_dir = '/models'

In [None]:
# Hyper-parameters:
#   lr     - learning rate, i.e. how much the weights are updated at each
#            iteration; 0.002 worked best among the values tried.
#   epochs - how many times the model goes through the whole dataset;
#            limited to 10 because of time constraints.
epochs, lr = 10, 0.002
# Fit the ResNet-34 to the data. This takes about 50 minutes on Kaggle
# (please activate the GPU accelerator if running on Kaggle).
learner.fit_one_cycle(cyc_len=epochs, max_lr=lr)
# NOTE(review): the original trailing comment said "this saves the weights",
# but no save call follows in this cell.

In [None]:
# Export the trained learner to disk and reload it for inference.
# export() writes "export.pkl" under learner.path, so the reload must read from
# that same directory; loading from "/models" would look for a file that
# export() never wrote there.
learner.export()
learner = load_learner(path=learner.path)

In [None]:
# Show a batch of the model's predictions next to the true labels.
# NOTE(review): the original comment said "train dataset"; fastai's
# show_results defaults to the validation set — confirm which is intended.
learner.show_results()

In [None]:
#### PHASE 3: RUNNING THE MODEL ON THE TEST DATASET
# First, read the test table from /test.csv and preview its first rows.
testdf = pd.read_csv("/test.csv")
testdf.head()

In [None]:
# Collect the paths of all test-set images.
pathofdata = "/"
images_dir = pathofdata + "images"
# Keep only the files whose name starts with "Test".
# filter_by_func filters the list in place and returns that same list.
testdata = ImageList.from_folder(images_dir).filter_by_func(
    lambda image_path: image_path.name.startswith("Test")
)
testdata

In [None]:
# Read the first test image and predict its class with the model.
first_test_image = testdata.items[0]
img1 = open_image(first_test_image)
learner.predict(img1)
# ---> the author observed a result of 3, which is the label of the scab disease

In [None]:
#### PHASE 4: PREPARING THE SUBMISSION FILE TO CHECK OUR ACCURACY ON UNSEEN DATA
# Build the submission dataframe: one row per test image holding the image id
# followed by the predicted probability of each class.
# The original cell referenced `sampsubmit`, which is never defined in this
# notebook; the column names are spelled out here and the row order is taken
# from `testdf` instead.
# NOTE(review): assumes the probabilities come back in label order 0..3
# (healthy, multiple_diseases, rust, scab) and that test.csv has an
# `image_id` column listing every test image — confirm.
submission_columns = ["image_id", "healthy", "multiple_diseases", "rust", "scab"]

resultlist = []
for item in testdata.items:
    img = open_image(item)
    # predict() returns a tuple; element 2 holds the per-class probabilities.
    predval = learner.predict(img)[2].tolist()
    # item.stem is the file name without its extension, i.e. the image id.
    resultlist.append([item.stem] + predval)

resultdf = pd.DataFrame(resultlist, columns=submission_columns)
# Re-order the rows to follow the order of the test table.
resultdf = (resultdf.set_index("image_id")
                    .loc[testdf["image_id"], :]
                    .reset_index())
resultdf.head()

In [None]:
# Write the submission CSV (without the index column); on Kaggle the file
# shows up in the output folder on the right.
resultdf.to_csv("submit.csv",index=False)
# After submitting these results to Kaggle, the author reports a score of
# 0.94405 on the unseen test dataset.
# NOTE(review): the original comment wrote "0.94405% of accuracy"; presumably
# a score of 0.94405 (about 94.4%) was meant — confirm the metric.

'''THANK YOU 
END OF THE KERNEL'''