In [None]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

This Bee Colony data came from the U.S. Department of Agriculture: [https://downloads.usda.library.cornell.edu/usda-esmis/files/rn301137d/nc5819380/sb397x676/hcny0820.txt](https://downloads.usda.library.cornell.edu/usda-esmis/files/rn301137d/nc5819380/sb397x676/hcny0820.txt)
The webpage offers the data in 10 tables in an obscure format, and the **goal is to use Pandas to quickly get the data into a more usable form.**

The original data:
- It needed to be copied from 10 tables and combined into one table in Excel.
- It had a table for each three-month range in the time frame. The tables for APRIL-JUNE were fully null, so I left that time frame out.
- I added "Month Range" and "Year" columns so that all the data could fit into one table, and manually changed (Z) values to 0.





![](https://imgur.com/a/j6va06o)

In [None]:
import pandas as pd 
import numpy as np
# Load the hand-assembled bee colony table and preview its first five rows.
data = pd.read_csv('../input/bee-data-cleaningg/Bee Data.csv')
data.head()

The first column represents the number of colonies at the beginning of each month range. Changing the name to 'Colonies_at_start'.

From source: 
Maximum colonies 1/ = January 1 colonies plus all colonies moved into that state during the quarter. This will represent the max number of colonies per state during each month range.
Changing the column title to 'Maximum_colonies'.

From Source: 

Percent lost 2/ is the number of lost colonies divided by maximum colonies except for the United States.                                

Renovated colonies 3/ is defined as any surviving colony that was requeened or received new honey bees through a nuc or package.

Percent renovated 4/  is the number of renovated colonies divided by maximum colonies except for the United States, where percent renovated is the number of renovated colonies divided by the January 1 colonies. 

These column names make sense to keep, and I will simply take off the numbers. 

In [None]:
# Normalise the header names: drop the footnote markers ("1/" ... "4/") and
# use underscores instead of spaces.
data = data.rename(columns={
    'January 1 colonies': 'Colonies_at_start',
    'Maximum colonies 1/': 'Maximum_colonies',
    'Lost colonies': 'Lost_colonies',
    'Percent lost 2/': 'Percent_lost',
    'Added colonies': 'Added_colonies',
    'Renovated colonies 3/': 'Renovated_colonies',
    'Percent renovated 4/': 'Percent_renovated',
    'Percent with varroa mites': 'Percent_with_varroa_mites',
    'Percent with other pests': 'Percent_with_other_pests',
    'Percent with pesticides': 'Percent_with_pesticides',
    'Percent with other failures': 'Percent_with_other_failures',
    'Percent unknown failures': 'Percent_unknown_failures',
    'Percent with diseases': 'Percent_with_diseases',
    'Month Range': 'Month_Range',
})
data.head(3)

Removing the trailing periods and colon, and the 5/ in "Other States 5/", from the 'State' column:

In [None]:
# State column: strip the trailing dot leaders / colon, drop the "5/"
# footnote marker, then trim the whitespace that removing the marker leaves
# behind (e.g. "Other States 5/ ....:" -> "Other States", not "Other States ").
data['State'] = (
    data['State']
    .str.rstrip(' .:')
    # regex=False: "5/" is a literal marker, not a pattern.
    .str.replace('5/', '', regex=False)
    .str.strip()
)

From source: 
(-) Represents zero.
(Z) Less than half of the unit shown.

From source: (X) = Not applicable. Dropping these rows.

Removing commas from numeric columns. 

In [None]:
# Replacing placeholder values.
#
# Only cells that consist entirely of a zero placeholder ("-", "(-)" or
# "(0)") are rewritten to "0". Anchoring the pattern to the whole cell keeps
# hyphens inside other text (e.g. the "January - March" labels in
# Month_Range) untouched, so no repair step is needed afterwards. The
# parentheses are escaped because an unescaped "(0)" is a regex group that
# matches a bare "0" and never removes literal parentheses.
data = data.replace(r'^\s*(?:-|\(-\)|\(0\))\s*$', '0', regex=True)

# Remove thousands separators so the numeric columns can be parsed later.
data = data.replace(',', '', regex=True)

data.head(3)

Removing rows with NaN values

In [None]:
# Discard every row that has at least one missing value.
data = data.dropna(axis=0, how='any')


Checking and changing column data types 

In [None]:
# Show the dtype of each column before converting anything.
data.dtypes

In [None]:
# Columns to parse as numbers.
# NOTE(review): Percent_lost, Percent_with_varroa_mites and
# Percent_with_other_failures are not in this list -- presumably they are
# already numeric; confirm.
cols_to_convert = [
    'Colonies_at_start',
    'Maximum_colonies',
    'Lost_colonies',
    'Added_colonies',
    'Renovated_colonies',
    'Percent_renovated',
    'Percent_with_other_pests',
    'Percent_with_diseases',
    'Percent_with_pesticides',
    'Percent_unknown_failures',
]
data[cols_to_convert] = data[cols_to_convert].apply(pd.to_numeric)
data.dtypes

Still trying to figure out what to do with these month ranges. Making a concatenated column with month range and year in case that helps with analysis. We want to be able to differentiate by season and year when explaining the data. 

In [None]:

# Store the year and month range as text so they can be concatenated into
# a single period label.
for label_col in ('Year', 'Month_Range'):
    data[label_col] = data[label_col].map(str)
data.dtypes





# data['Months and year'] = data['Year'] + ' ' + data['Month Range']
# data.head(3)


In [None]:
# Let pandas re-infer dtypes for the object columns, then show the result.
data = data.infer_objects()
data.dtypes

In [None]:
# Adding a combined "<month range> <year>" label column (both source columns
# are strings at this point, so "+" concatenates them).
data['Month_range_and_year'] = data['Month_Range'] + ' ' + data['Year']

def Q(row):
    """Return the quarter label for a row, based on its 'Month_Range' value.

    'January - March' -> 'Q1', 'April - June' -> 'Q2',
    'July - September' -> 'Q3'; every other value yields 'Q4'.
    """
    quarter_by_range = {
        'January - March': 'Q1',
        'April - June': 'Q2',
        'July - September': 'Q3',
    }
    return quarter_by_range.get(row['Month_Range'], 'Q4')

# Apply Q to every row (axis=1) to derive the quarter label column.
data['Quarter'] = data.apply(Q, axis=1)


In [None]:
# Count the remaining missing values per column.
data.isna().sum()

In [None]:
# Write the cleaned table to disk without the index column.
data.to_csv('outputfile.csv', index=False)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt 
import matplotlib.patches as mpatch
import seaborn as sns
import numpy as np
# Plain white seaborn theme for all figures below.
sns.set_style('white')

# Short aliases for the columns used in the plots.
state = data['State']
renovated = data['Renovated_colonies']
month_range = data['Month_Range']
c_at_start = data['Colonies_at_start']
lost = data['Lost_colonies']
added = data['Added_colonies']
months_year = data['Month_range_and_year']
fig, ax = plt.subplots(figsize=(12,6))

# Renovated, lost and added colonies by state, drawn as overlaid bars.
# NOTE(review): the bars still overlap rather than stack; stacking would
# need one aggregated value per state and `bottom=` offsets.
ax.bar(state, renovated, color="grey", linewidth=1, alpha=0.5)
ax.bar(state, lost, color="red")
ax.bar(state, added, color="green")
ax.set_xlabel('States')
ax.set_ylabel('Colonies')
plt.xticks(rotation='vertical')
y = np.arange(0,250000,20000)
plt.yticks(y)
plt.ticklabel_format(style='plain', axis='y')
plt.title('Bee Colonies by State')
# plt.gca().spines['top'].set_visible(False)
# plt.gca().spines['right'].set_visible(False)

# The legend is built from explicit colour patches, so the bars themselves
# need no labels.
legend_handles = [
    mpatch.Patch(color='grey', label='Renovated'),
    mpatch.Patch(color='red', label='Lost'),
    mpatch.Patch(color='green', label='Added'),
]
plt.legend(handles=legend_handles)
# Actually call tight_layout (a bare `plt.tight_layout` does nothing), and do
# it last so the rotated tick labels and title are taken into account.
plt.tight_layout()
plt.show()

# Renovated colonies by state.
fig, ax = plt.subplots(figsize=(12,6))
plt.bar(state, renovated, linewidth=3, color='grey', alpha=.8)
ax.set_xlabel('States')
ax.set_ylabel('Renovated_Colonies')
plt.xticks(rotation='vertical')
y = np.arange(0,250000,20000)
plt.yticks(y)
plt.ticklabel_format(style='plain', axis='y')
plt.title('Renovated Colonies by State')
# Actually call tight_layout (a bare `plt.tight_layout` does nothing), and do
# it last so the rotated tick labels and title are taken into account.
plt.tight_layout()
plt.show()

# Lost colonies by time frame (month range + year).
plt.bar(months_year, lost, linewidth=3, color='red', alpha=.4)
plt.title("Lost Bee Colonies by Season")
plt.yticks(np.arange(0,500000,100000))
plt.xticks(rotation='vertical')
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
# Actually call tight_layout (a bare `plt.tight_layout` does nothing), and do
# it last so the rotated tick labels are taken into account.
plt.tight_layout()
plt.show()

# Added colonies by time frame (month range + year).
plt.bar(months_year, added, linewidth=3, color='green', alpha=.4)
plt.title("Added Bee Colonies by Season")
plt.yticks(np.arange(0,500000,100000))
plt.xticks(rotation='vertical')
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
# Actually call tight_layout (a bare `plt.tight_layout` does nothing), and do
# it last so the rotated tick labels are taken into account.
plt.tight_layout()
plt.show()



In [None]:
# Let's look at possible correlations.
# Select numeric columns explicitly: excluding only 'object' would let any
# other non-numeric dtype (string, category, datetime) through to corr().
numeric_columns = data.select_dtypes(include='number')

corr = numeric_columns.corr().round(3)
corr.style.background_gradient(cmap='coolwarm')

# numeric_columns2 = data[['Colonies at start of month range','Maximum colonies during month range',
#                          'Lost colonies','Percent lost','Added colonies','Renovated colonies']]
# corr2 = numeric_columns2.corr().round(3)
# corr2.style.background_gradient(cmap='coolwarm')


The output dataset will be much better for further analysis! 