# Rice Classification With Logistic Regression and a Decision Tree Classifier

In [None]:
import pandas as pd
import matplotlib.pyplot       as plt
import seaborn                 as sns
import plotly.express          as ex
import plotly.graph_objs       as go
import plotly.offline          as py
import plotly.express as px
# Read the rice dataset from the Kaggle input directory and display it.
csv_path = r'../input/rice-type-classification/riceClassification.csv'
data = pd.read_csv(csv_path)
data

# ***Some Data Cleaning***

In [None]:
# The "id" column is not useful for the analysis, so it is dropped.
data = data.drop(columns=["id"])


In [None]:
# Count the missing values in every column.
data.isnull().sum()

**Since the dataset is in good shape, we will continue with the EDA**

# EDA

In [None]:
# Relabel the numeric target on a copy so `data` keeps its 0/1 encoding.
class_names = {1: 'Class : 1', 0: 'Class : 0'}
d = data.assign(Class=data["Class"].map(class_names))
ex.pie(d, names='Class', title='Rice Classes')

**We see that 54.9% of the data belongs to class 1 and 45.1% to class 0, so the data is reasonably balanced.**

In [None]:
# Bar chart of the number of rows per class.
class_counts = data["Class"].value_counts()
class_counts.plot.bar()

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition  import PCA

# Number of principal components to keep.
N = 5
pca_pipeline = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('PCA', PCA(N)),
])

# Select the features by name instead of by position: the previous
# `data.iloc[:, :9]` slice kept only the first nine columns, which leaves a
# feature out because this notebook works with ten feature columns plus "Class".
features = data.drop(columns=["Class"])
components = pca_pipeline.fit_transform(features)

# One column per component (PC1..PCN), aligned with the rows of `data`.
tf_data = pd.DataFrame(
    components,
    columns=[f'PC{i + 1}' for i in range(N)],
    index=data.index,
)
tf_data['label'] = data["Class"].map({0: 'Class 0', 1: 'Class 1 '})

# 3-D view of the first three components, coloured by class.
ex.scatter_3d(tf_data, x='PC1', y='PC2', z='PC3', color='label',
              color_discrete_sequence=['blue', 'red'])

In [None]:
def var_distribution2(dataframe):
    """Plot a histogram grid for the frame's columns and return their variances.

    Parameters:
        dataframe: pandas DataFrame whose columns are plotted.

    Returns:
        The result of ``dataframe.var()``.
    """
    import matplotlib.pyplot as plt

    column_names = pd.Series(dataframe.columns)
    selected = dataframe[column_names]
    selected.hist(figsize=(14, 14))
    plt.show()
    variances = dataframe.var()
    return variances


var_distribution2(data)

**We can see that the distribution of most variables is approximately Gaussian**

In [None]:
def Outliers(dataframe,cols):
    """Show one box plot per column, side by side in a single row.

    Parameters:
        dataframe: pandas DataFrame holding the columns to plot.
        cols: column names to draw; one subplot is created for each.

    Returns:
        Whatever ``fig.show()`` returns, after displaying the figure.
    """
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots

    column_names = list(cols)
    fig = make_subplots(rows=1, cols=len(cols))

    # Subplot columns are 1-based, hence start=1.
    for position, column in enumerate(column_names, start=1):
        series = dataframe[column]
        box = go.Box(y=series.values, name=series.name)
        fig.add_trace(box, row=1, col=position)

    return fig.show()


cols = data.columns.tolist()
Outliers(data, cols)

**We can see outliers in some features, mostly because the features take many different values, so we will take care of the farthest outliers later**

In [None]:
# Mean of every column within each class.
by_class = data.groupby("Class")
by_class.agg("mean")

In [None]:
def corr(dataframe,target_variable):
    """Draw a correlation heatmap and rank columns by correlation with the target.

    Parameters:
        dataframe: pandas DataFrame to analyse.
        target_variable: name of the column to rank the others against.

    Returns:
        A Series of absolute correlations with ``target_variable``, sorted in
        descending order.
    """
    import seaborn as sns
    import matplotlib.pyplot as plt

    # Computed once from the argument; the ranking previously read the global
    # `data`, so the heatmap and the returned values could disagree.
    correlation_matrix = dataframe.corr()

    _, ax = plt.subplots(figsize=(10, 10))
    # Only the displayed numbers are rounded; the returned values are not.
    sns.heatmap(data=correlation_matrix.round(2), annot=True, ax=ax)

    return correlation_matrix[target_variable].abs().sort_values(ascending=False)


corr(data,"Class")

**We can see that MinorAxisLength, AspectRation, Roundness, Area, ConvexArea, EquivDiameter and Eccentricity have a very high correlation with the target variable**

In [None]:
# One scatter plot of each highly correlated feature against the class.
# A single loop replaces seven copy-pasted cells that differed only in the
# feature name; the figures are shown in the same order as before.
scatter_features = [
    'MinorAxisLength',
    'AspectRation',
    'Roundness',
    'Area',
    'ConvexArea',
    'EquivDiameter',
    'Eccentricity',
]
for feature in scatter_features:
    fig1 = px.scatter(data_frame=data, x='Class', y=feature, color='Class', hover_name=feature)
    fig1.update_layout(title=dict(text=f'{feature} vs Class', xanchor='center', yanchor='top', x=0.5))
    fig1.show()

**We can see that most of these features have a negative relationship with class 1**

In [None]:
# Per-class histogram of each highly correlated feature.
# A single loop replaces seven copy-pasted cells that differed only in the
# feature name; the figures are shown in the same order as before.
histogram_features = [
    "MinorAxisLength",
    "AspectRation",
    "Roundness",
    "Area",
    "ConvexArea",
    "EquivDiameter",
    "Eccentricity",
]
for feature in histogram_features:
    grid = sns.FacetGrid(data, hue="Class", height=6)
    grid.map(sns.histplot, feature)
    grid.add_legend()
    plt.suptitle(feature, size=28)
    plt.show()

# Data Preprocessing

**Now we will remove the outliers mentioned earlier**

In [None]:
# A row is treated as an outlier when any of these bounds is crossed;
# every other row is kept.
outlier_mask = (
    (data["MajorAxisLength"] >= 183.2144)
    | (data["MajorAxisLength"] <= 77.41707)
    | (data["Eccentricity"] <= 0.6798581)
    | (data["EquivDiameter"] <= 58.25104)
    | (data["Perimeter"] <= 200.587)
    | (data["Perimeter"] >= 476.522)
    | (data["Roundness"] <= 0.2992976)
)
data = data[~outlier_mask]

**We will use a function to select the best variables for the model**

In [None]:
def feature_selector(dataframe,feature_number,target_variable):
    """Pick the best predictors with recursive feature elimination (RFE).

    A liblinear logistic regression is used as the RFE estimator. For every
    candidate column a line ``"<rank>: <selected>----> <column>"`` is printed.

    Parameters:
        dataframe: pandas DataFrame containing the predictors and the target.
        feature_number: number of predictors to keep.
        target_variable: name of the target column in ``dataframe``.

    Returns:
        A list with the names of the selected columns, or ``None`` (after
        printing an error message) if the number of selected columns does not
        equal ``feature_number``.
    """
    from sklearn.feature_selection import RFE
    from sklearn.linear_model import LogisticRegression

    target = dataframe[target_variable]
    features = dataframe.drop([target_variable], axis=1)
    column_names = features.columns.values.tolist()

    # n_features_to_select is passed by keyword: recent scikit-learn releases
    # reject it as a positional argument.
    estimator = LogisticRegression(solver="liblinear")
    rfe = RFE(estimator, n_features_to_select=feature_number)
    rfe = rfe.fit(features, target)

    # Report the rank and the selection flag of each column.
    for rank, keep, name in zip(rfe.ranking_, rfe.support_, column_names):
        print(f"{rank}: {keep}----> {name}")

    cols_selected = [name for name, keep in zip(column_names, rfe.support_) if keep]

    if len(cols_selected) == feature_number:
        return cols_selected
    print("ERROR!. Cols_Selected does not meet feature_number requirements")
    return None


feature_selector(data,8,"Class")

**We will check the VIF (variance inflation factor) of the features**

In [None]:
def VIF(dataframe,chosen_cols):
    """Compute the variance inflation factor of the chosen columns.

    A constant column is added with statsmodels' ``add_constant`` before the
    factors are computed, so the result also contains a row for it.

    Parameters:
        dataframe: pandas DataFrame holding the columns.
        chosen_cols: list of column names to evaluate.

    Returns:
        A DataFrame with the columns "feature" and "VIF".
    """
    from statsmodels.stats.outliers_influence import variance_inflation_factor
    from statsmodels.tools.tools import add_constant

    design = add_constant(dataframe[chosen_cols])
    factors = [
        variance_inflation_factor(design.values, position)
        for position in range(len(design.columns))
    ]
    return pd.DataFrame({"feature": design.columns, "VIF": factors})


chosen_cols = [
    'Area',
    'MajorAxisLength',
    'MinorAxisLength',
    'ConvexArea',
    'EquivDiameter',
    'Perimeter',
    'Roundness',
    'AspectRation',
]
VIF(data, chosen_cols)

**With this VIF function, features with values above 10 should be removed from the model; here we have extremely high values, so we will do some cleaning**

In [None]:
# Reduced feature set whose VIF is checked again.
new_chosen_cols = [
    "MajorAxisLength",
    "Roundness",
    "Eccentricity",
    "Extent",
]
VIF(data, new_chosen_cols)

**We removed the features with the highest VIF and added some that were not chosen by the feature_selector**

# Logistic Regression

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression

X = data[["MajorAxisLength", "Roundness", "Eccentricity", "Extent"]]
Y = data["Class"]

# Fit on the whole dataset and report how well the model reproduces it.
logit_model = linear_model.LogisticRegression(max_iter=1000)
logit_model.fit(X, Y)
# `score` on a classifier returns the mean accuracy, not R2, so the label
# names the metric that is actually printed.
print(f"Training accuracy is {logit_model.score(X,Y)}")

# 10-fold cross-validated accuracy of a fresh model with the same settings.
scores = cross_val_score(linear_model.LogisticRegression(max_iter=1000), X, Y, scoring="accuracy", cv=10)
print(f"\nThe score mean with cross validation is {scores.mean()*100}%")

**We got a training accuracy of 98% (`score` on a classifier returns accuracy, not R2) and a cross-validation accuracy of 97%, which is pretty good. Now we will split the dataset into train and test sets**

In [None]:

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import linear_model
import pandas as pd
from sklearn import metrics
import numpy as np

X = data[["MajorAxisLength", "Roundness", "Eccentricity", "Extent"]]
Y = data["Class"]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

lm = linear_model.LogisticRegression(max_iter=1000)
lm.fit(X_train, Y_train)
predict = lm.predict(X_test)
probs = lm.predict_proba(X_test)
# Second column of predict_proba: probability of class 1.
prob = probs[:, 1]
prob_df = pd.DataFrame(prob)

# Custom cut-off used only for the "prediction" column below; the accuracy
# is computed from `lm.predict`, not from this threshold.
treshold = 0.25

# Computed once and reused for both the printout and `acc_lr`.
acc_lr = metrics.accuracy_score(Y_test, predict)
print(f"Prediction Accuracy: {acc_lr}")
print()
prob_df["prediction"] = np.where(prob_df[0] > treshold, 1, 0)
print(prob_df)
print(f"\nTreshold: {treshold}")
print()
con_tab = pd.crosstab(prob_df["prediction"], columns="Count")
# The mean of a 0/1 column is the share of positives. Indexing
# `con_tab.values[1]` raised IndexError when only one class was predicted
# and printed a one-element array instead of a number.
positive_share = prob_df["prediction"].mean() * 100
print(f"Number of Positive Cases: {positive_share}%")
con_tab
    

**We got a prediction accuracy of 98%**

**Now let's see the ROC curve**

In [None]:
from sklearn import metrics

# roc_curve returns (false positive rates, true positive rates, thresholds).
# The variable names are kept from the original cell.
especifities, sensibilities, _ = metrics.roc_curve(Y_test, prob)

df = pd.DataFrame({
    "x": especifities,
    "y": sensibilities,
})

auc = metrics.auc(especifities, sensibilities)

print(f"The AUC is: {auc}")

# Drawn with matplotlib (imported at the top of the notebook as `plt`)
# instead of `from ggplot import *`, which star-imports an unmaintained package.
plt.plot(df["x"], df["y"])
# Dashed diagonal as a visual reference line.
plt.plot([0, 1], [0, 1], linestyle="--")
plt.xlim(-0.01, 1.01)
plt.ylim(-0.01, 1.01)
plt.xlabel("x")
plt.ylabel("y")
plt.show()
    


#  Now we will use the second model, a Decision Tree Classifier

In [None]:
def sklearn_decision_tree_clasiffier(dataframe,chosen_cols,target_variable,max_depth):
    """Train and evaluate an entropy-based decision tree classifier.

    The rows are split at random into roughly 75% train and 25% test, and the
    tree is fitted on the train part. The training accuracy and a 100-fold
    shuffled cross-validation accuracy over the whole frame are printed.

    Side effects:
        Sets the module-level names ``predictor`` (the chosen columns), ``X``
        and ``Y`` (predictors and target of the whole frame) and ``acc_dtc``
        (accuracy on the held-out test rows).

    Parameters:
        dataframe: pandas DataFrame with predictors and target; not modified.
        chosen_cols: list of predictor column names.
        target_variable: name of the target column.
        max_depth: maximum depth of the tree (``None`` for unlimited).

    Returns:
        A crosstab of actual vs. predicted classes on the test rows.
    """
    from sklearn.tree import DecisionTreeClassifier
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import cross_val_score
    from sklearn.model_selection import KFold
    global predictor,X,Y,acc_dtc
    predictor = chosen_cols
    target = target_variable

    # A local boolean mask is used so that no "is_train" column is written
    # into the caller's dataframe.
    is_train = np.random.uniform(0, 1, len(dataframe)) <= 0.75
    train, test = dataframe[is_train], dataframe[~is_train]

    tree = DecisionTreeClassifier(
        criterion="entropy",
        # scikit-learn requires an integer min_samples_split of at least 2.
        min_samples_split=max(2, int(len(dataframe) / 16)),
        # Previously accepted by the function but never passed to the tree.
        max_depth=max_depth,
    )
    tree.fit(train[predictor], train[target])
    preds = tree.predict(test[predictor])
    # `acc_dtc` was declared global but never assigned; it now holds the
    # held-out accuracy, like `acc_lr` for the logistic model.
    acc_dtc = tree.score(test[predictor], test[target])
    # `score` on a classifier returns the mean accuracy, not R2.
    print(f"The training accuracy is {tree.score(train[predictor],train[target])}")
    print()

    X = dataframe[predictor]
    Y = dataframe[target]
    # No refit on (X, Y) is needed: cross_val_score fits clones of the estimator.
    cv = KFold(n_splits=100, shuffle=True)
    score = np.mean(cross_val_score(tree, X, Y, scoring="accuracy", cv=cv, n_jobs=1))
    print(f"The score for Cross Validation is : {score}")
    print("-----------------------------------------------------------------")

    return pd.crosstab(test[target], preds, rownames=["Actual"], colnames=["Predictions"])


chosen_cols=["MajorAxisLength","Roundness","Eccentricity","Extent"]
sklearn_decision_tree_clasiffier(data,chosen_cols,"Class",5)

**We got an accuracy of 96%, which is not as good as the logistic regression**

**Now we will see its ROC curve**

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

X = data[["MajorAxisLength", "Roundness", "Eccentricity", "Extent"]]
Y = data["Class"]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
tree = DecisionTreeClassifier(criterion="entropy", min_samples_split=int((len(data)/16)))

tree.fit(X_train, Y_train)
predict = tree.predict(X_test)
probs = tree.predict_proba(X_test)
# Second column of predict_proba: probability of class 1.
prob = probs[:, 1]
prob_df = pd.DataFrame(prob)

treshold = 0.25

# roc_curve returns (false positive rates, true positive rates, thresholds).
# The variable names are kept from the original cell.
especifities, sensibilities, _ = metrics.roc_curve(Y_test, prob)

df = pd.DataFrame({
    "x": especifities,
    "y": sensibilities,
})

auc = metrics.auc(especifities, sensibilities)

print(f"The AUC is: {auc}")

# Drawn with matplotlib (imported at the top of the notebook as `plt`)
# instead of `from ggplot import *`, which star-imports an unmaintained package.
plt.plot(df["x"], df["y"])
# Dashed diagonal as a visual reference line.
plt.plot([0, 1], [0, 1], linestyle="--")
plt.xlim(-0.01, 1.01)
plt.ylim(-0.01, 1.01)
plt.xlabel("x")
plt.ylabel("y")
plt.show()
    


#  Thanks for reading. This is only my second Kaggle prediction notebook and my first EDA, so I would love some feedback!