In [None]:
import pandas as pd
import numpy as np
# ^^^ pyforest auto-imports - don't write above this line

In [None]:
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)


In [None]:
# Location of the crop-recommendation CSV, relative to the notebook's working directory.
path_train = '../input/crop-recommendation-dataset/Crop_recommendation.csv'
# Load the raw dataset; the later cells of this notebook all derive from this frame.
data = pd.read_csv(path_train)

In [None]:
data.info()

In [None]:
data

In [None]:
data['label'].value_counts()

### Process

In [None]:
# Spell out the nutrient column names (N, P, K) so plots and tables are self-explanatory.
nutrient_names = {'N': 'nitrogen', 'P': 'phosphorus', 'K': 'potassium'}
data = data.rename(columns=nutrient_names)

In [None]:
data.isnull().sum()

In [None]:
# Working copy used for the exploratory plots. DataFrame.round() with no arguments
# rounds every numeric column to 0 decimals, so this copy holds whole-number values.
data_copy = data.round()

In [None]:
# Integer code for each crop name, numbered in order of first appearance in the data.
crop_names = data['label'].unique()
label_map = dict(zip(crop_names, range(len(crop_names))))

In [None]:
# Encode the crop names with label_map. Series.map performs the dictionary lookup
# directly and yields an integer column; Series.replace only got there through an
# implicit object -> int downcast, which recent pandas versions deprecate.
data_copy['label'] = data['label'].map(label_map)

In [None]:
# Cut potassium into 20 intervals and store each interval's string form as the bin label.
potassium_intervals = pd.cut(data_copy['potassium'], bins=20)
data_copy['potassium_bin'] = potassium_intervals.apply(str)

In [None]:
# Cut phosphorus into 7 intervals and store each interval's string form as the bin label.
phosphorus_intervals = pd.cut(data_copy['phosphorus'], bins=7)
data_copy['phosphorus_bin'] = phosphorus_intervals.apply(str)

In [None]:
# Cut nitrogen into 7 intervals and store each interval's string form as the bin label.
nitrogen_intervals = pd.cut(data_copy['nitrogen'], bins=7)
data_copy['nitrogen_bin'] = nitrogen_intervals.apply(str)

In [None]:
# Cut humidity into 8 intervals and store each interval's string form as the bin label.
humidity_intervals = pd.cut(data_copy['humidity'], bins=8)
data_copy['humidity_bin'] = humidity_intervals.apply(str)

In [None]:
# Cut temperature into 7 intervals and store each interval's string form as the bin label.
temperature_intervals = pd.cut(data_copy['temperature'], bins=7)
data_copy['temperature_bin'] = temperature_intervals.apply(str)

In [None]:
# Cut rainfall into 9 intervals and store each interval's string form as the bin label.
rainfall_intervals = pd.cut(data_copy['rainfall'], bins=9)
data_copy['rainfall_bin'] = rainfall_intervals.apply(str)

In [None]:
# Keep the original crop names next to the integer-encoded 'label' column so the
# plots below can show readable category names.
data_copy['label_'] = data['label']

In [None]:
# Parallel-categories view of crop against the binned soil nutrients and the rounded pH.
soil_dimensions = ['label_', 'potassium_bin', 'phosphorus_bin', 'nitrogen_bin', 'ph']
fig = px.parallel_categories(
    data_copy[soil_dimensions],
    color_continuous_scale=px.colors.sequential.Inferno,
)
fig.show()


In [None]:
# Parallel-categories view of crop against the binned climate variables.
climate_dimensions = ['label_', 'humidity_bin', 'temperature_bin', 'rainfall_bin']
fig = px.parallel_categories(data_copy[climate_dimensions])
fig.show()


In [None]:
# Minimum and maximum of every rounded feature, per crop.
rounded_by_crop = data.round().groupby('label')
y = rounded_by_crop.agg(['min', 'max'])
y

In [None]:
# Per-crop "min - max" range of every rounded feature, stored as display strings.
df = data.round()

# Select the feature columns by name instead of "everything but the last column":
# once a later cell appends 'label_map', the positional slice would treat the text
# 'label' column as a feature when this cell is re-run.
feature_cols = df.columns.drop(['label', 'label_map'], errors='ignore')

# sort=False keeps the crops in order of first appearance, like data['label'].unique().
grouped = df.groupby('label', sort=False)[list(feature_cols)]
mins, maxs = grouped.min(), grouped.max()

# Build the whole table in one pass rather than writing one x.loc[...] cell at a time.
x = pd.DataFrame(
    {
        col: [f'{lo} - {hi}' for lo, hi in zip(mins[col], maxs[col])]
        for col in feature_cols
    },
    index=mins.index.rename(None),
)

In [None]:
x

In [None]:
data

In [None]:
# Integer-encode the crop names as the modelling target. Series.map does the dictionary
# lookup directly and returns an integer column; Series.replace depended on an implicit
# object -> int downcast that recent pandas versions deprecate, and an object-dtype
# target is not a valid class label for the scikit-learn estimators used below.
data['label_map'] = data['label'].map(label_map)
# Round every numeric column to 0 decimals (DataFrame.round default) before modelling.
data = data.round()

In [None]:
# Feature matrix: every column except the text label and its integer encoding.
target_columns = ['label', 'label_map']
X = data.drop(columns=target_columns)

# Target vector: the integer-encoded crop label.
Y = data['label_map']

#### Multicollinearity

In [None]:
def check_mutlicolinearity(data_x):
    """Summarise pairwise feature correlation and return its lower triangle.

    Computes ``data_x.corr()``, keeps only the entries strictly below the
    diagonal (each feature pair once, no self-correlation) and prints, for the
    thresholds 0.5 to 0.9, the share of pairs whose absolute correlation
    exceeds the threshold.

    Parameters
    ----------
    data_x : pandas.DataFrame
        Feature columns to correlate.

    Returns
    -------
    pandas.DataFrame
        The correlation matrix with NaN on and above the diagonal.
    """
    corr = data_x.corr()

    # Blank out the diagonal and upper triangle by position. Masking by position
    # (instead of zeroing and then replacing 0 with NaN) keeps pairs whose
    # correlation is genuinely 0, and avoids np.NAN, which NumPy 2.0 removed.
    lower_triangle = np.tril(np.ones(corr.shape, dtype=bool), k=-1)
    corr = corr.where(lower_triangle)

    count_of_total_correlation_values = corr.count().sum()

    for i in [0.5, 0.6, 0.7, 0.8, 0.9]:
        count_greater_than_thresh = (corr.abs() > i).sum().sum()
        # With fewer than two usable features there are no pairs to compare;
        # report NaN instead of dividing by zero.
        if count_of_total_correlation_values:
            share = count_greater_than_thresh / count_of_total_correlation_values
        else:
            share = float('nan')
        print(f'Percent Values Greater than {i} co-relation : {share}')
    return corr

In [None]:
def plot_corr(corr, threshold=0.5):
    """Show an annotated heatmap of the correlations stronger than ``threshold``.

    Entries whose absolute value does not exceed ``threshold`` are blanked
    (set to NaN) before plotting, so only the strong pairs are drawn.
    """
    strong_only = corr.where(corr.abs() > threshold)
    sns.heatmap(strong_only, annot=True, cmap="YlGnBu")
    plt.show()

In [None]:
corr = check_mutlicolinearity(X)

In [None]:
import seaborn as sns
plot_corr(corr)

#### Feature Selection

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

In [None]:
# Score every feature against the target with f_classif. k='all' keeps all
# features, so the selector is fitted here only to obtain its per-feature scores.
fs = SelectKBest(score_func=f_classif, k='all')
fs.fit(X, Y)

In [None]:
# Bar chart of the per-feature scores computed by the selector above.
fig = px.bar(x = X.columns, y =fs.scores_, template = 'plotly_dark',log_y=True)  # log-scaled y axis
fig.show()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a test set; random_state pins the shuffle so the split is reproducible.
# Neither test_size nor stratify is passed, so scikit-learn's defaults apply.
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state = 0)

### Models

In [None]:
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
from sklearn.metrics import classification_report,hamming_loss,roc_auc_score,confusion_matrix

In [None]:
def base_estimator(x_train, x_test, y_train, y_test,est = LogisticRegression):
    """Fit a baseline classifier and print its train and test classification reports.

    Parameters
    ----------
    x_train, x_test : pandas.DataFrame or array-like
        Training and test feature matrices.
    y_train, y_test : pandas.Series or array-like
        Training and test targets.
    est : estimator class, default LogisticRegression
        The class (not an instance) to fit. It is created with
        ``random_state=0``; LogisticRegression additionally gets one-vs-rest
        mode and ``max_iter=200``.

    Returns
    -------
    None
        The two reports are printed, separated by a dashed line.
    """
    if est is LogisticRegression:
        # NOTE(review): `multi_class` is deprecated in recent scikit-learn
        # releases - confirm against the installed version.
        mod = est(random_state=0, multi_class = 'ovr', max_iter = 200)
    else:
        mod = est(random_state=0)

    # Convert once and use the same plain arrays for fitting and predicting.
    # Fitting on `.values` but predicting on the DataFrames made scikit-learn
    # warn that the model was fitted without feature names.
    train_features = np.asarray(x_train)
    test_features = np.asarray(x_test)
    mod.fit(train_features, np.asarray(y_train))

    y_train_predict = mod.predict(train_features)
    y_test_predict = mod.predict(test_features)

    print(classification_report(y_train,y_train_predict))

    print('-'*50)
    print(classification_report(y_test,y_test_predict))
    

In [None]:
base_estimator(x_train, x_test, y_train, y_test, LogisticRegression )

In [None]:
base_estimator(x_train, x_test, y_train, y_test, RandomForestClassifier )

In [None]:
base_estimator(x_train, x_test, y_train, y_test, DecisionTreeClassifier )

In [None]:
# Stand-alone one-vs-rest logistic regression, kept as a named object so that its
# predictions and coefficients can be inspected in the cells that follow.
model = LogisticRegression(random_state=0,n_jobs=-1, multi_class='ovr', max_iter=200)
model.fit(x_train, y_train)

In [None]:
# Test-set report labelled with crop names; label_map's keys are in integer-code order.
y_predict_logit = model.predict(x_test)
crop_names_in_code_order = list(label_map)
logit_report = classification_report(
    y_test,
    y_predict_logit,
    target_names=crop_names_in_code_order,
)
print(logit_report)

In [None]:
### Parameter checking

In [None]:
import plotly.graph_objs as go


In [None]:
# Coefficient table of the fitted logistic model: one row per crop, one column per feature.
df_coeff = pd.DataFrame(model.coef_, index=list(label_map), columns=X.columns)
# df_coeff['intercept'] = model.intercept_  # uncomment to plot the intercept as well

# One line per crop across the features, to compare how each feature is weighted.
cols = df_coeff.columns
fig = go.Figure()
for index, coefficients in df_coeff.iterrows():
    trace = go.Scatter(x=cols, y=coefficients.values, mode='lines', name=index)
    fig.add_trace(trace)
fig.update_layout(template='plotly_dark')
fig.show()

In [None]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import tree
from IPython.display import SVG
from graphviz import Source
from IPython.display import display                               
from ipywidgets import interactive

# Crop names in order of first appearance; passed to export_graphviz as class_names.
lab = data['label'].unique()
# Despite the name, `labels` holds the feature (column) names of X; passed as feature_names.
labels = X.columns
def plot_tree(crit, split, depth, min_split, min_leaf=1):
    """Fit a decision tree on the notebook-level X and Y, draw it inline, and return it.

    Parameters
    ----------
    crit : passed to DecisionTreeClassifier as ``criterion``.
    split : passed as ``splitter``.
    depth : passed as ``max_depth``.
    min_split : passed as ``min_samples_split``.
    min_leaf : passed as ``min_samples_leaf`` (default 1).

    Returns
    -------
    DecisionTreeClassifier
        The fitted estimator. The tree is also rendered as an SVG, using the
        notebook globals ``labels`` (feature names) and ``lab`` (class names).
    """
    tree_params = dict(
        random_state=0,
        criterion=crit,
        splitter=split,
        max_depth=depth,
        min_samples_split=min_split,
        min_samples_leaf=min_leaf,
    )
    estimator = DecisionTreeClassifier(**tree_params).fit(X, Y)

    # Export the fitted tree as DOT text, then render that text to SVG for inline display.
    dot_source = tree.export_graphviz(
        estimator,
        out_file=None,
        feature_names=labels,
        class_names=lab,
        filled=True,
    )
    svg_image = Source(dot_source).pipe(format='svg')
    display(SVG(svg_image))
    return estimator

# Choices offered by the widget for each plot_tree argument.
tree_widget_options = {
    'crit': ["gini", "entropy"],
    'split': ["best", "random"],
    'depth': [None, 1, 2, 3, 4],
    'min_split': [2, 1.0, 0.5, 0.1],
    'min_leaf': [1, 2, 0.1, 0.5],
}
inter = interactive(plot_tree, **tree_widget_options)
display(inter)