In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
FILEPATH = '/kaggle/input/tamilnadu-cropproduction/Tamilnadu agriculture yield data.csv'

In [None]:
df = pd.read_csv(FILEPATH)

In [None]:
df.sample()

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
list(df.columns)

In [None]:
df.isnull().any().any()

In [None]:
df.isnull().any()

In [None]:
import missingno as mino

In [None]:
mino.matrix(df)

In [None]:
mino.dendrogram(df)

In [None]:
mino.bar(df)

In [None]:
df.sample(3)

In [None]:
# Keep the original row number as an explicit 'index' column; the donut and
# treemap helpers further down count rows through that column.
# Guarded so the cell can be re-run: resetting again would add a stray
# 'level_0' column instead of leaving the frame as it is.
if 'index' not in df.columns:
    df = df.reset_index()

In [None]:
df.sample(3)

In [None]:
df['State_Name'].unique()

In [None]:
# The state column is of no use for this analysis, so remove it.
# errors='ignore' keeps the cell safe to re-run once the column is already gone.

df = df.drop(columns=['State_Name'], errors='ignore')

In [None]:
df.sample(3)

In [None]:
# Normalise district names to Title Case. The vectorised .str accessor leaves
# missing values as NaN, whereas calling x.title() on a NaN float would raise
# (rows with missing values are only dropped in a later cell).
df['District_Name'] = df['District_Name'].str.title()

In [None]:
df.dropna(how='any', inplace=True)

In [None]:
# Google Translator

!pip install googletrans

In [None]:
# clean up season

from googletrans import Translator

translator = Translator()

In [None]:
def convert_me(msg):
    """Translate ``msg`` to English with the notebook-level ``translator`` and return the translated text."""
    return translator.translate(msg, dest='en').text

In [None]:
df['Season'].unique()

In [None]:
convert_me('Kharif')

In [None]:
convert_me('Rabi')

As Googletrans is not translating these words properly, we use our own knowledge to fix it.

In Arabic, Rabi means spring and Kharif means autumn. So, we map Rabi to the Spring season and Kharif to the Autumn season.

In [None]:
# Maps the season labels handled by this notebook to English season names.
season_map = {
    'Kharif' : 'Autumn',
    'Rabi' : 'Spring',
    'Whole Year' : 'Whole Year'
}

def convert_season(season):
    """Return the English season name for ``season``.

    Surrounding whitespace is ignored when the label is looked up. A value
    that is not a key of ``season_map`` (including one that has already been
    converted) is returned unchanged, so applying the function to the same
    column twice does not raise ``KeyError``.
    """
    # Only strings can carry padding; anything else is looked up as-is.
    key = season.strip() if isinstance(season, str) else season
    return season_map.get(key, season)

In [None]:
df['Season'] = df['Season'].apply(convert_season)

In [None]:
df.sample(5)

In [None]:
df.sample(3)

In [None]:
df['Production'].unique()

In [None]:
import plotly.express as px
import plotly.graph_objects as go

In [None]:
district_df = df[['District_Name', 'Production']]

In [None]:
district_df.sample(2)

In [None]:
district_df = pd.DataFrame(district_df['Production'].value_counts().head(10)).reset_index()

In [None]:
district_df.sample(3)

In [None]:
state_fig = go.Figure(data=[go.Pie(labels=district_df['index'],
                             values=district_df['Production'],
                             hole=.7,
                             title = 'Count by District',
                             marker_colors = px.colors.sequential.Blues_r,
                            )
                     ])
state_fig.update_layout(title = '% by District')
state_fig.show()

In [None]:
df.sample(2)

In [None]:
district_production_group = df.groupby("District_Name")["Production"].sum().sort_index(ascending=True)

In [None]:
dist_prod_df = pd.DataFrame({'District_Name': district_production_group.index,
                        'Production': district_production_group.values})

# dist_prod_df

In [None]:
import plotly.express as px

fig = px.bar(dist_prod_df, x="Production", y="District_Name", orientation='h', color = 'Production')
fig.show()

In [None]:
# Unique crop

df['Crop'].unique()

In [None]:
# Which district produces the most of a given crop (e.g. Brinjal)?

def show_production_by_crop_and_district(df, crop):
    """Show a horizontal bar chart of total ``Production`` per district for one crop.

    df: frame with 'Crop', 'District_Name' and 'Production' columns.
    crop: value of the 'Crop' column to keep.
    """
    crop_rows = df.loc[df['Crop'] == crop]

    # Sum production per district, districts in ascending name order.
    totals = crop_rows.groupby("District_Name")["Production"].sum().sort_index(ascending=True)

    # Turn the grouped Series into a two-column frame for plotly.
    per_district = totals.reset_index()

    px.bar(per_district, x="Production", y="District_Name", orientation='h', color = 'Production').show()

In [None]:
show_production_by_crop_and_district(df, 'Grapes')

In [None]:
show_production_by_crop_and_district(df, 'Sunflower')

In [None]:
show_production_by_crop_and_district(df, 'Coconut')

In [None]:
show_production_by_crop_and_district(df, 'Tapioca')

In [None]:
df.sample(4)

### Single District Production Rate

In [None]:
def show_production_by_crop_and_dist(df, district):
    """Show a horizontal bar chart of total ``Production`` per ``Crop_Year`` for one district.

    df: frame with 'District_Name', 'Crop_Year' and 'Production' columns.
    district: value of the 'District_Name' column to keep.
    """
    district_rows = df.loc[df['District_Name'] == district]

    # Sum production per year, years in ascending order.
    yearly_totals = district_rows.groupby("Crop_Year")["Production"].sum().sort_index(ascending=True)

    # Turn the grouped Series into a two-column frame for plotly.
    per_year = yearly_totals.reset_index()

    px.bar(per_year, x="Production", y="Crop_Year", orientation='h', color = 'Production').show()

In [None]:
show_production_by_crop_and_dist(df, 'Madurai')

In [None]:
import matplotlib.pyplot as plt

def show_donut_plot(col, max_cols = 10):
    """Draw a donut chart of the number of rows per value of ``col``.

    Reads the notebook-level ``df``.

    col: name of the column in ``df`` to group by.
    max_cols: number of groups to plot. Groups are taken in groupby (sorted
        key) order, not by size.
    """
    # size() counts the rows of each group directly, so this helper does not
    # rely on an 'index' column having been created by an earlier cell.
    group_counts = df.groupby(col).size().head(max_cols)

    plt.figure(figsize = (12, 8))
    # plt.pie expects 1-D data, so pass the plain values rather than a
    # one-column DataFrame.
    plt.pie(group_counts.values, autopct = '%1.0f%%', startangle = 140, pctdistance = 1.1, shadow = True)

    # A white circle in the centre turns the pie into a donut.
    gap = plt.Circle((0, 0), 0.5, fc = 'white')
    plt.gca().add_artist(gap)

    plt.axis('equal')

    # One legend entry per plotted group, in the same order as the wedges.
    plt.legend([str(key) for key in group_counts.index])

    plt.title('Donut Plot by ' +str(col), loc='center')

    plt.show()

In [None]:
show_donut_plot('District_Name')

In [None]:
show_donut_plot('Crop_Year', 8)

In [None]:
import squarify

def show_treemap(col, max_labels = 10, max_items = 20):
    """Draw a treemap of the number of rows per value of ``col``.

    Reads the notebook-level ``df``.

    col: name of the column in ``df`` to group by.
    max_labels: only the first ``max_labels`` (largest) tiles get a text label.
    max_items: number of largest groups drawn as tiles (previously fixed at 20).
    """
    # size() counts the rows of each group directly, so this helper does not
    # rely on an 'index' column having been created by an earlier cell.
    group_counts = df.groupby(col).size().sort_values(ascending = False).head(max_items)

    type_sizes = group_counts.tolist()
    type_labels = [str(key) + ' ('+str(count)+')' for key, count in group_counts.items()]

    fig, ax = plt.subplots(1, figsize = (12,12))
    # Draw on the axes created above instead of relying on the implicit current axes.
    squarify.plot(sizes=type_sizes,
                  label=type_labels[:max_labels],  # label only the first max_labels tiles
                  alpha=.2,
                  ax=ax)

    ax.set_title('TreeMap: Count by '+ str(col))
    ax.axis('off')
    plt.show()

In [None]:
show_treemap('Crop_Year')

In [None]:
show_treemap('District_Name')

In [None]:
df.sample(2)

In [None]:
fig = px.sunburst(df, path=['District_Name', 'Crop_Year'], values='Production',
                  color='Production', hover_data=['Production'])
fig.show()

In [None]:
fig = px.sunburst(df, path=['Crop_Year', 'Season'], values='Production',
                  color='Production', hover_data=['Production'])
fig.show()

In [None]:
df['Crop_Year'].max()

In [None]:
last_4_years_df = df[df['Crop_Year'] > 2009]

In [None]:
fig = px.sunburst(last_4_years_df, path=['Crop_Year', 'Season'], values='Area',
                  color='Area', hover_data=['Area'])
fig.show()

In [None]:
fig = px.sunburst(last_4_years_df, path=['Crop_Year', 'Crop'], values='Area',
                  color='Area', hover_data=['Area'])
fig.show()

In [None]:
df.sample(2)

In [None]:
theni_df = df[df['District_Name'] == 'Theni']
theni_df = theni_df[theni_df['Crop_Year'] > 2009]

In [None]:
fig = px.sunburst(theni_df, path=['Crop_Year', 'Crop'], values='Area',
                  color='Area', hover_data=['Area'])
fig.show()

In [None]:
def show_crop_sunburtst_by_district(district = 'Theni', after_year = 2009):
    """Show a sunburst of cultivated ``Area`` by year and crop for one district.

    Reads the notebook-level ``df``.

    district: value of the 'District_Name' column to keep.
    after_year: only rows with ``Crop_Year`` strictly greater than this value
        are plotted (defaults to the previously hard-coded 2009).
    """
    selected = df[(df['District_Name'] == district) & (df['Crop_Year'] > after_year)]

    fig = px.sunburst(selected, path=['Crop_Year', 'Crop'], values='Area',
                  color='Area', hover_data=['Area'])
    fig.show()

In [None]:
show_crop_sunburtst_by_district('Madurai')

In [None]:
show_crop_sunburtst_by_district('Kanniyakumari')

**To do:**

* Add some documentation for each plot
* Add more visuals

**Final Notes:**

I am still adding things. You can come back and check for more information.

Also, if you **like my notebook**, <font style="color:blue;font-size:14px;">please upvote it</font> as it will motivate me to come up with better approaches in the upcoming notebooks.

<font color="blue" size=+1.5><b>Check out my other kernels</b></font>

<table style="font-family: 'Trebuchet MS', Arial, Helvetica, sans-serif;border-collapse: collapse;width: 100%;">	
  <tr>
    <th style="border: 1px solid #ddd;padding: 8px; padding-top: 12px;padding-bottom: 12px;text-align: left;background-color: #2987E7;color: white;">Notebook</th>
    <th style="border: 1px solid #ddd;padding: 8px; padding-top: 12px;padding-bottom: 12px;text-align: left;background-color: #2987E7;color: white;">Tags</th>
  </tr>
  <tr>
    <td style="text-align: left"><a href="https://www.kaggle.com/rajacsp/reviews-with-nlp-wip">Amazon Alexa Reviews with NLP</a> </td>
    <td style="text-align: left">NLP, Spacy</td>
  </tr>
  <tr>
    <td style="text-align: left"><a href="https://www.kaggle.com/rajacsp/sof-questions-eda-and-visual">SOF Questions - EDA and Visual</a> </td>
    <td style="text-align: left">Data Visual, Plotly</td>
  </tr>
  <tr>
    <td style="background-color: #f2f2f2;text-align: left"><a href="https://www.kaggle.com/rajacsp/netflix-visualization-plotly-plots-treemap">Netflix - Visualization, Plotly, Plots, and Treemap</a> </td>
    <td style="background-color: #f2f2f2;text-align: left">Data Visual, Data Cleaning, Plotly</td>
  </tr>
  <tr>
    <td style="text-align: left"><a href="https://www.kaggle.com/rajacsp/prediction-with-various-algorithms">Prediction with various Algorithms</a> </td>
    <td style="text-align: left">Random Forest, Logistic Regression</td>
  </tr>
  <tr>
    <td style="background-color: #f2f2f2;text-align: left"><a href="https://www.kaggle.com/rajacsp/eda-and-visualization">EDA and Visualization</a> </td>
    <td style="background-color: #f2f2f2;text-align: left">Data Cleaning, Data Visual</td>
  </tr>
  <tr>
    <td style="text-align: left"><a href="https://www.kaggle.com/rajacsp/job-analysis-eda-visual">Job Analysis - EDA and Visual</a> </td>
    <td style="text-align: left">Data Visual, EDA, Plotly</td>
  </tr>   
  <tr>
    <td style="background-color: #f2f2f2;text-align: left"><a href="https://www.kaggle.com/rajacsp/estonia-disaster-visualization">Estonia Disaster - Visualization</a> </td>
    <td style="background-color: #f2f2f2;text-align: left">Data Visual, EDA, Data Cleaning</td>
  </tr>
    
  <tr>
    <td style="text-align: left"><a href="https://www.kaggle.com/rajacsp/pandas-dundas-challenge-100" >Pandas 100+ exercises collection</a></td>
    <td style="text-align: left">Pandas, Data Manipulation</td>
  </tr>   
  <tr>
    <td style="background-color: #f2f2f2;text-align: left"><a href="https://www.kaggle.com/rajacsp/prediction-with-various-algorithms">Credit Card Fraud - Prediction with various algorithms</a></td>
    <td style="background-color: #f2f2f2;text-align: left">Various ML Algorithms</td>
  </tr>  
  <tr>
    <td style="text-align: left"><a href="https://www.kaggle.com/rajacsp/linear-equations-real-time">Linear Equations - Real Time</a> </td>
    <td style="text-align: left">Linear Equation</td>
  </tr>  
</table>

