In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# Any results you write to the current directory are saved as output.

In [None]:
import warnings
warnings.filterwarnings("ignore") 
import pandas as pd
import calendar
import numpy as np

import random
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from deap import creator, base, tools, algorithms
import sys

In [None]:
def week2month(hep):
    """
    Add calendar columns derived from the integer ``week`` column of ``hep``.

    ``week - 1`` is rendered as text, suffixed with ``"6"`` and parsed with the
    format ``%Y%U%w`` (year, week-of-year, weekday 6 = Saturday).

    The frame is modified in place and also returned. Columns added:
    ``LastDayWeek`` (the parsed date), ``MonthMax`` (its month number),
    ``Year`` (its year) and ``MonthName`` (name looked up in
    ``calendar.month_name``).
    """
    week_code = (hep['week'] - 1).astype(str) + "6"
    last_day = pd.to_datetime(week_code, format="%Y%U%w")

    hep['LastDayWeek'] = last_day
    hep['MonthMax'] = last_day.dt.month
    hep['Year'] = last_day.dt.year
    hep['MonthName'] = list(map(calendar.month_name.__getitem__, hep['MonthMax']))
    return hep

In [None]:
def get_input_transform(file_):
    """
    Read the CSV at ``file_`` and return it with the calendar columns that
    ``week2month`` adds.
    """
    raw_frame = pd.read_csv(file_)
    return week2month(raw_frame)

In [None]:
# load the seven per-disease datasets from Kaggle, each passed through
# get_input_transform so they all carry the same derived calendar columns
hep, mea, mum, per, pol, rub, sma = map(
    get_input_transform,
    (
        '../input/contagious-diseases/hepatitis.csv',
        '../input/contagious-diseases/measles.csv',
        '../input/contagious-diseases/mumps.csv',
        '../input/contagious-diseases/pertussis.csv',
        '../input/contagious-diseases/polio.csv',
        '../input/contagious-diseases/rubella.csv',
        '../input/contagious-diseases/smallpox.csv',
    ),
)

In [None]:
# for our exploratory purpose we examine data for 1960 through to 2011 for US states
# combining all the disease datasets is shown below

# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every
# iteration; a single concat stacks all seven frames in one pass. The original
# per-file row labels are kept (no ignore_index), exactly as append did.
train_data = pd.concat([hep, mea, mum, per, pol, rub, sma])
train_data = train_data.loc[train_data['Year'].between(1960, 2011)]

# examine the dataset
train_data.head()

In [None]:
# some data discrepancies must be resolved (issues like \\N )
# A row is "bad" when its `cases` value cannot be parsed as a number. Testing
# this directly avoids relying on the placeholder token happening to be the
# lexicographically largest string in the column (which would flag valid rows
# whenever no placeholder is present).
bad_case_mask = pd.to_numeric(train_data['cases'], errors='coerce').isna()
train_data_bad = train_data[bad_case_mask]
print(train_data_bad.head(10))


In [None]:
# Index labels (not positions) of the rows collected in train_data_bad.
ind = list(train_data_bad.index)

In [None]:
# Keep only rows whose `cases` value parses as a number. A boolean mask is used
# instead of drop(train_data.index[ind]): `ind` holds index *labels*, so using
# them as positions selects unrelated rows, and because the combined frame
# repeats labels across diseases, dropping by label removes extra rows as well.
train_data = train_data[pd.to_numeric(train_data['cases'], errors='coerce').notna()]

In [None]:
# Preview the first 10 rows after the bad records were removed.
train_data.head(10)

In [None]:
# Save the cleaned frame as for_visulisation.csv; get_1yr_viz() reads this file
# back later to build the per-year choropleth plots.

train_data.to_csv('for_visulisation.csv')

In [None]:
# examining the characteristics of the dataset
# Only the last expression of a cell is rendered, so the shape is printed
# explicitly and describe() is left last to get the rich table output.
print(train_data.shape)
train_data.describe()

In [None]:
# converting non-numerical data to lowercase (to keep consistencies over all future datasets included)

# Vectorised string op: no Python-level loop, and missing values stay missing
# instead of raising AttributeError on a float NaN.
train_data['state_name'] = train_data['state_name'].str.lower()
train_data.shape

In [None]:
# Average-temperature table; state names are lower-cased so they match the
# lower-cased state_name key used for the merge with the disease data.
join_tavg = pd.read_csv('../input/temperature/tavg_data.csv')
# .str.lower() is vectorised and leaves missing values untouched.
join_tavg['state_name'] = join_tavg['state_name'].str.lower()

In [None]:
# Preview the first 10 rows of the temperature table.
join_tavg.head(10)

In [None]:
# Attach the temperature columns to the disease records, matching on week and
# state name. merge() defaults to an inner join, so records without a matching
# temperature row are not carried over. Both names are bound to the merged frame.
train_data = result = train_data.merge(join_tavg, on=['week', 'state_name'])

In [None]:
# Preview the first 10 rows of the merged frame.
result.head(10)

In [None]:
# Same preview through train_data, which was rebound to the merged frame.
train_data.head(10)

In [None]:
# Precipitation table; state names are lower-cased so they match the
# lower-cased state_name key used for the merge with the disease data.
join_pcp = pd.read_csv("../input/precipitate/pcp.csv")
# .str.lower() is vectorised and leaves missing values untouched.
join_pcp['state_name'] = join_pcp['state_name'].str.lower()

In [None]:
# Attach the precipitation columns, matching on week and state name. merge()
# defaults to an inner join, so records without a matching precipitation row
# are not carried over. Both names are bound to the merged frame.
train_data = result = train_data.merge(join_pcp, on=['week', 'state_name'])

In [None]:
# Preview the first 10 rows after the precipitation merge.
train_data.head(10)

In [None]:
from sklearn.preprocessing import MinMaxScaler as mm

scaler = mm()

# Min-max scale each weather column independently, in this order:
# avg temp deviations, avg temp, precipitation deviations, precipitation.
# One loop replaces four copy-pasted fit/transform stanzas; the scaler is
# re-fitted for every column, so it ends up fitted on the last one.
for column in ('tavg_anomaly', 't_avg', 'pcp_anomaly', 'precipitate'):
    column_values = train_data[column].values.reshape(-1, 1)
    train_data[column] = scaler.fit_transform(column_values).ravel()

In [None]:
# Preview the frame after the weather columns were rescaled.
train_data.head()

In [None]:
# Write the merged and scaled frame to result.csv, without the index column.
train_data.to_csv('result.csv',index=False)

In [None]:
# Preview the first 10 rows of the frame that was just written out.
train_data.head(10)

In [None]:
# Count rows that repeat an earlier row on every column (subset=None compares
# all columns; keep='first' leaves the first occurrence uncounted).
train_data.duplicated(subset=None, keep='first').sum()

In [None]:
# Take the columns that are not used as model features out of the frame.

# The unique state names are captured first because the state label encoder is
# fitted on them later; state_name itself stays in the frame.
states = np.asarray(train_data['state_name'].unique())

# pop() removes each column from train_data and hands it back as a Series.
week, LOW, monthN, state = (
    train_data.pop(column)
    for column in ('week', 'LastDayWeek', 'MonthName', 'state')
)

In [None]:
# Split target from features: the `disease` column is removed from the frame and
# becomes y; X is the remaining frame. X is the same object as train_data (no
# copy is made), so later changes to one are visible through the other.
y = train_data.pop('disease')
X = train_data
print(X)

In [None]:
# Show the target column (disease labels, still un-encoded at this point).
print(y)

In [None]:
# check dimensions: X and y come from the same frame, so their row counts
# should agree

print(X.shape)
print(y.shape)

In [None]:
# Summary statistics of the feature frame (train_data is the same object as X).
train_data.describe()

In [None]:
# Preview the first 10 rows of the feature frame.
train_data.head(10)

In [None]:
# label encoding of useful non-numerical attributes

from sklearn.preprocessing import LabelEncoder

# target: disease names -> integer class ids (le keeps the mapping)
le = LabelEncoder()
y = le.fit_transform(y)

# feature: state names -> integer ids, fitted on the unique names in `states`
le2 = LabelEncoder().fit(states)
X['state_name'] = le2.transform(X['state_name'])

Genetic Algorithm for feature selection

In [None]:
def avg(l):
    """
    Arithmetic mean of the elements of ``l``.

    ``l`` must support ``sum`` and ``len``; the element count is converted to
    float before dividing.
    """
    total = sum(l)
    count = float(len(l))
    return total / count

In [None]:
def getFitness(individual, X, y):
    """
    Fitness of a feature mask: mean 5-fold cross-validation score of a
    depth-10 decision tree trained on the selected columns.

    ``individual`` is a list of 0/1 flags, one per column of ``X``; columns
    whose flag is 0 are dropped and the rest are passed through
    ``pd.get_dummies`` before scoring. Returns a one-element tuple. A mask
    that selects no column scores ``(0,)``.
    """
    # guard clause: nothing selected, so there is nothing to train on
    if individual.count(0) == len(individual):
        return (0,)

    # positions of the columns switched off by the mask
    dropped_positions = [pos for pos, flag in enumerate(individual) if flag == 0]

    kept_columns = X.drop(X.columns[dropped_positions], axis=1)
    encoded = pd.get_dummies(kept_columns)

    tree = DecisionTreeClassifier(max_depth=10)
    fold_scores = cross_val_score(tree, encoded, y, cv=5)
    return (avg(fold_scores),)

In [None]:
def geneticAlgorithm(X, y, n_population, n_generation):
    """
    Run DEAP's ``eaSimple`` to search for a good feature mask over ``X``.

    Individuals are lists of random 0/1 flags, one per column of ``X``, scored
    with ``getFitness`` (maximised). Variation uses one-point crossover
    (cxpb=0.5) and bit-flip mutation (mutpb=0.2, indpb=0.05); selection is a
    tournament of size 3.

    Parameters: ``X`` feature frame, ``y`` labels, ``n_population`` number of
    individuals, ``n_generation`` number of generations.

    Returns the hall of fame, sized ``n_population * n_generation``.
    """
    # creator.create() registers the classes on the global deap.creator module.
    # Guard the calls so that invoking this function again (e.g. re-running the
    # notebook cell) reuses the existing classes instead of redefining them.
    if not hasattr(creator, "FitnessMax"):
        creator.create("FitnessMax", base.Fitness, weights=(1.0,))
    if not hasattr(creator, "Individual"):
        creator.create("Individual", list, fitness=creator.FitnessMax)

    # create toolbox
    toolbox = base.Toolbox()
    toolbox.register("attr_bool", random.randint, 0, 1)
    toolbox.register("individual", tools.initRepeat,
                     creator.Individual, toolbox.attr_bool, len(X.columns))
    toolbox.register("population", tools.initRepeat, list,
                     toolbox.individual)
    toolbox.register("evaluate", getFitness, X=X, y=y)
    toolbox.register("mate", tools.cxOnePoint)
    toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
    toolbox.register("select", tools.selTournament, tournsize=3)

    # initialize parameters
    pop = toolbox.population(n=n_population)
    hof = tools.HallOfFame(n_population * n_generation)
    # per-generation statistics of the fitness values, shown in the verbose log
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean)
    stats.register("min", np.min)
    stats.register("max", np.max)

    # genetic algorithm
    pop, log = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2,
                                   ngen=n_generation, stats=stats, halloffame=hof,
                                   verbose=True)

    # return hall of fame
    return hof

In [None]:
def bestIndividual(hof, X, y):
    """
    Pick the individual with the highest fitness from the hall of fame.

    Parameters: ``hof`` iterable of individuals, each a sequence of 0/1 flags
    with a ``fitness.values`` tuple; ``X`` object whose iteration yields the
    feature names (e.g. a DataFrame); ``y`` unused, kept for interface
    compatibility.

    Returns ``(fitness_values, individual, header)`` where ``header`` lists
    the names of the features whose flag is 1. On ties the first individual
    in ``hof`` order wins.

    Raises ``ValueError`` if ``hof`` is empty.
    """
    candidates = list(hof)
    if not candidates:
        raise ValueError("hall of fame is empty; no individual to select")

    # max() also handles the case where every fitness is 0.0, which previously
    # left the result variable unassigned.
    _individual = max(candidates, key=lambda ind: ind.fitness.values[0])

    # materialise the column names once instead of once per flag
    columns = list(X)
    _individualHeader = [columns[i] for i, flag in enumerate(_individual) if flag == 1]
    return _individual.fitness.values, _individual, _individualHeader

In [None]:
def getArguments():
    """
    Read the run settings from the command line.

    ``sys.argv[1]`` is the dataframe path. When exactly three arguments are
    given, the second and third are parsed as the population size and the
    number of generations; otherwise these default to 10 and 2.

    Returns ``(dfPath, pop, gen)``.
    """
    argv = sys.argv
    dfPath = argv[1]
    if len(argv) != 4:
        return dfPath, 10, 2
    return dfPath, int(argv[2]), int(argv[3])


In [None]:
if __name__ == '__main__':
    # GA settings: population size and number of generations
    n_pop = 10
    n_gen = 15

    # Seed both RNGs so the run is reproducible: the GA draws from `random`,
    # and the decision tree falls back to numpy's global generator when no
    # random_state is given.
    seed = 42
    random.seed(seed)
    np.random.seed(seed)

    # get accuracy with all features
    individual = [1 for i in range(len(X.columns))]
    print("Accuracy with all features: \t" +
          str(getFitness(individual, X, y)) + "\n")

    # apply genetic algorithm
    hof = geneticAlgorithm(X, y, n_pop, n_gen)

    # select the best individual
    accuracy, individual, header = bestIndividual(hof, X, y)
    print('Best Accuracy: \t' + str(accuracy))
    print('Number of Features in Subset: \t' + str(individual.count(1)))
    print('Individual: \t\t' + str(individual))
    print('Feature Subset\t: ' + str(header))

    print('\n\ncreating a new classifier with the result')

    # Evaluate on the selected features only. X itself is left untouched so the
    # cell can be re-run, and the subset goes through get_dummies exactly as in
    # getFitness so this score is comparable with the GA's fitness values.
    X_selected = pd.get_dummies(X[header])

    clf = DecisionTreeClassifier(max_depth=10)

    scores = cross_val_score(clf, X_selected, y, cv=5)
    print("Accuracy with Feature Subset: \t" + str(avg(scores)) + "\n")

In [None]:
pip install chart_studio

In [None]:
# using plotly for the beautiful plots

import os

import chart_studio
import pandas as pd

# Credentials must not be stored in the notebook. They are read from the
# PLOTLY_USERNAME / PLOTLY_API_KEY environment variables instead (sign up to
# plotly to obtain them). Any key that was previously committed here should be
# treated as leaked and regenerated.
plotly_username = os.environ.get('PLOTLY_USERNAME')
plotly_api_key = os.environ.get('PLOTLY_API_KEY')

if plotly_username and plotly_api_key:
    chart_studio.tools.set_credentials_file(username=plotly_username,
                                            api_key=plotly_api_key)
else:
    # The charts below are drawn with plotly.offline, so the login is optional.
    print('PLOTLY_USERNAME / PLOTLY_API_KEY not set; skipping Chart Studio login')



In [None]:
import pandas as pd 
import numpy as np
import plotly.offline as py

def get_viz(the_yr_data, yr):
    """
    Draw an inline choropleth of the USA coloured by number of cases.

    Parameters: ``the_yr_data`` frame with the columns ``state`` (used as
    plotly ``USA-states`` locations), ``cases`` (convertible to float) and
    ``text`` (hover text); ``yr`` integer year, used only in the titles.

    The caller's frame is not modified. Nothing is returned.
    """
    py.init_notebook_mode(connected=True)

    # Work on a stringified copy instead of casting the caller's columns in place.
    the_yr_data = the_yr_data.astype(str)

    # Stepped greyscale: ten equal bands over [0, 1], from rgb(0, 0, 0) up to
    # rgb(180, 180, 180) in steps of 20; each band is listed at both of its
    # edges so the colour is constant inside the band.
    scl = []
    for band in range(10):
        shade = 'rgb(%d, %d, %d)' % (band * 20, band * 20, band * 20)
        scl.append([band / 10, shade])
        scl.append([(band + 1) / 10, shade])

    data = [ dict(
            type='choropleth',
            colorscale = scl,
            # NOTE(review): per the plotly reference, autocolorscale=True selects
            # the default palette and `colorscale` is then not used - confirm
            # which of the two is actually wanted.
            autocolorscale = True,
            locations = the_yr_data['state'],
            z = the_yr_data['cases'].astype(float),
            # colour range is fixed to 0..500 cases
            zmin=0,
            zmax=500,
            locationmode = 'USA-states',
            text = the_yr_data['text'],
            marker = dict(
                line = dict (
                    color = 'rgb(255,255,255)',
                    width = 2
                )
            ),
            colorbar = dict(
                title = "Disease outbreak - cases in %d" %(yr)
            )
        ) ]

    layout = dict(
        title = '%d US Diseases Cases Found by State<br>(Hover for breakdown)' %(yr),
        geo = dict(
            scope='usa',
            projection=dict( type='albers usa' ),
            showlakes = True,
            lakecolor = 'rgb(255, 255, 255)',
            ),
    )

    fig = dict( data=data, layout=layout )

    py.iplot( fig, validate=False)

def get_1yr_viz(yr):
    """
    Plot the per-state case totals for one year.

    Reads ``for_visulisation.csv``, keeps the rows whose ``Year`` equals
    ``yr``, sums ``cases`` per (``state_name``, ``state``), builds a hover text
    listing the diseases reported in each state, and passes the result to
    ``get_viz``.
    """
    data = pd.read_csv('for_visulisation.csv')
    the_yr_data = data.loc[data['Year'] == yr]

    # state name -> set of diseases reported there in this year
    diseases_by_state = {}
    for state_name, disease in zip(the_yr_data['state_name'], the_yr_data['disease']):
        diseases_by_state.setdefault(state_name, set()).add(disease)

    # An explicit copy keeps the cast below from writing into a slice of `data`.
    totals = the_yr_data.loc[:, ['state_name', 'state', 'cases']].copy()
    totals['cases'] = totals['cases'].astype(int)
    totals = totals.groupby(['state_name', 'state']).sum().reset_index()

    # sorted() gives a stable disease order; joining a bare set would let the
    # hover text change from one run to the next.
    totals['disease_all'] = [
        ' '.join(sorted(diseases_by_state[name])) for name in totals['state_name']
    ]
    totals['text'] = totals['state_name'] + '<br>' + 'Disease ' + totals['disease_all']

    get_viz(totals, yr)

In [None]:
# Choropleth of the per-state case totals for 2011.
get_1yr_viz(2011)

In [None]:
# Choropleth of the per-state case totals for 1970.
get_1yr_viz(1970)