In [None]:
# We'll need these libraries
import numpy as np
import pandas as pd 
from pandas import read_csv

# Plotting libraries
import seaborn as sns
from ggplot import *

# Load the three Kaggle datasets used in this notebook:
#   recipes -> worked example; bikes / weather -> choices for the "Your turn!" exercise.
recipes = pd.read_csv("../input/epirecipes/epi_r.csv")
bikes = pd.read_csv("../input/nyc-east-river-bicycle-crossings/nyc-east-river-bicycle-counts.csv")
weather = pd.read_csv("../input/szeged-weather/weatherHistory.csv")

In [None]:
# Keep only recipes below 10000 calories (drops extreme outliers), then
# discard every row that still contains a missing value.
below_calorie_cap = recipes['calories'] < 10000
recipes = recipes.loc[below_calorie_cap].dropna()

In [None]:
# Run numpy's isreal() on every rating and confirm that all rows pass.
# See https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.isreal.html
print("Is this variable numeric?")
rating_is_real = recipes['rating'].apply(np.isreal)
all(rating_is_real)  # True only when every row is True

In [None]:
print("Is this variable only integers?")
# A value equals its own integer cast only when it has no fractional part.
ratings = recipes['rating']
ratings_as_int = ratings.astype(int)
all(ratings == ratings_as_int)

In [None]:
# Scatter plot: calories on the x axis, the dessert indicator on the y axis.
calories_vs_dessert = aes(x='calories', y='dessert')
ggplot(recipes, calories_vs_dessert) + geom_point()

In [None]:
# The same calories-vs-dessert scatter plot, drawn with seaborn.
# fit_reg=False: per the seaborn docs this plots the points only, with no fitted line.
sns.set(style="darkgrid")
g = sns.regplot(x="calories", y="dessert", data=recipes, fit_reg=False)
# Resize the figure that owns the plot to 8 x 8 inches.
g.figure.set_size_inches(8, 8)

Since whether or not something is a dessert is a categorical variable, we want to use logistic regression to answer this question. As you can see in the chart below, the family to use for logistic regression is "binomial".
___
![Regression guide](https://image.ibb.co/ducqSw/regression_guide.png)
___

We can fit and plot a regression line to our data using the geom_smooth() layer. To make sure we do logistic regression, we need to make sure that we tell geom_smooth to fit a regression model from the binomial family, like so:
    
    geom_smooth(method = "glm", method.args = list(family = "binomial"))
 
 If we were fitting a linear regression model instead, for example, we'd ask geom_smooth() to fit a model from the gaussian family, like so:
    
    geom_smooth(method = "glm", method.args = list(family = "gaussian"))
    
geom_smooth() relies on the x and y arguments that you pass to ggplot (in the aes() argument), so you want to make sure you put the value you're interested in predicting in the y argument slot. (Here, that's "dessert".)

In [None]:
import pandas as pd
import numpy as np

from ggplot.geoms.geom import geom
from ggplot.stats import smoothers
from ggplot.utils import is_date

class stat_smooth(geom):
    """
    Smoothed line charts for inspecting trends in your data. There are 3 types of
    smoothing algorithms you can use:
        LOESS ('loess', 'lowess'): Non-parametric, local regression technique for
            calculating a smoothed curve.
        linear model ('lm'): Fits a linear model to your (x, y) coordinates
        moving average ('ma'): Calculates average of last N points in (x, y) coordinates
    Any method value other than 'lm' or 'ma' (including the default 'auto')
    falls through to the LOESS smoother.

    In addition to plotting the smoothed line, stat_smooth will also display the
    standard error bands of the smoothed data (controlled by se=True/False).

    Parameters
    ----------
    x:
        x values for (x, y) coordinates
    y:
        y values for (x, y) coordinates. these will ultimately be smoothed
    color:
        color of the outer line
    alpha:
        transparency of color (applied to the error band; defaults to 0.2)
    size:
        thickness of line
    linetype:
        type of the line ('solid', 'dashed', 'dashdot', 'dotted')
    se:
        boolean value for whether or not to display standard error bands; defaults to True
    method:
        type of smoothing to use ('loess', 'ma', 'lm')
    level:
        confidence level for the 'lm' band; 1 - level is handed to the linear
        smoother. Defaults to 0.95
    span:
        span handed to the LOESS smoother; defaults to 2/3
    window:
        number of periods to include in moving average calculation

    Examples
    --------
    ggplot(df, aes(x='x', y='y')) + geom_point() + stat_smooth(method='lm')
    """

    DEFAULT_AES = {'color': 'black'}
    DEFAULT_PARAMS = {'geom': 'smooth', 'position': 'identity', 'method': 'auto',
            'se': True, 'n': 80, 'fullrange': False, 'level': 0.95,
            'span': 2/3., 'window': None}
    REQUIRED_AES = {'x', 'y'}
    _aes_renames = {'size': 'linewidth', 'linetype': 'linestyle'}

    def plot(self, ax, data, _aes):
        """
        Smooth the (x, y) data and draw the band and/or the fitted line on ``ax``.

        ax:    matplotlib-style axes exposing fill_between() and plot()
        data:  DataFrame holding the columns referenced by the aesthetics
        _aes:  aesthetic mapping; its ``data`` dict maps aesthetic name -> column
        """
        (data, _aes) = self._update_data(data, _aes)
        variables = _aes.data
        # Restrict to the mapped columns and drop incomplete rows before smoothing.
        data = data[list(variables.values())]
        data = data.dropna()
        x = data[variables['x']]
        y = data[variables['y']]

        se = self.params.get('se', True)
        method = self.params.get('method', 'lm')
        level = self.params.get('level', 0.95)
        window = self.params.get('window', None)
        span = self.params.get('span', 2/3.)

        # Each smoother returns the x values, the smoothed y values and the
        # lower (y1) / upper (y2) bounds of the band.
        if method == "lm":
            x, y, y1, y2 = smoothers.lm(x, y, 1-level)
        elif method == "ma":
            x, y, y1, y2 = smoothers.mavg(x, y, window=window)
        else:
            x, y, y1, y2 = smoothers.lowess(x, y, span=span)

        smoothed_data = pd.DataFrame(dict(x=x, y=y, y1=y1, y2=y2))
        try:
            smoothed_data = smoothed_data.sort_values(by='x')
        except AttributeError:
            # Legacy pandas without DataFrame.sort_values. Catch only the
            # missing-attribute case so genuine sorting errors are not masked.
            smoothed_data = smoothed_data.sort('x')

        params = self._get_plot_args(data, _aes)
        if 'alpha' not in params:
            params['alpha'] = 0.2

        if se == True:
            if is_date(smoothed_data.x.iloc[0]):
                # Dates are converted to ordinals for fill_between, then the
                # tick labels are rebuilt with the original date class.
                # NOTE(review): the fitted line below still uses the raw date
                # values for x -- confirm both layers share the same x scale.
                dtype = smoothed_data.x.iloc[0].__class__
                x = np.array([i.toordinal() for i in smoothed_data.x])
                ax.fill_between(x, smoothed_data.y1, smoothed_data.y2, **params)
                new_ticks = [dtype(i) for i in ax.get_xticks()]
                ax.set_xticklabels(new_ticks)
            else:
                ax.fill_between(smoothed_data.x, smoothed_data.y1, smoothed_data.y2, **params)
        if self.params.get('fit', True)==True:
            # The transparency applies to the band only; draw the line opaque.
            del params['alpha']
            ax.plot(smoothed_data.x, smoothed_data.y, **params)


In [None]:
# Overlay a fitted trend on the scatter plot with the stat_smooth class defined above.
# NOTE: method="lm" selects the linear-model smoother, so this draws a straight
# line rather than a logistic curve.
(
    ggplot(recipes, aes(x='calories', y='dessert'))
    + geom_point()
    + stat_smooth(method="lm", color='blue')
)

In [None]:
# Logistic fit with seaborn: logistic=True asks regplot to fit a logistic
# regression to the dessert indicator (see the seaborn regplot docs).
sns.set(style="darkgrid")
g = sns.regplot(x="calories", y="dessert", data=recipes, logistic=True)
# Resize the figure that owns the plot to 8 x 8 inches.
g.figure.set_size_inches(8, 8)

## Your turn!
___
![Regression guide](https://image.ibb.co/ducqSw/regression_guide.png)

A quick guide to the three types of regression we've talked about.
___

Now it's your turn to come up with a question, pick the right model for your data and plot it.

1. Pick one of the two datasets ("weather" or "bikes", your choice! You can find out more about these datasets by expanding the "Input" section at the very top of this notebook.)
2. Identify which variables are continuous, categorical and count using the dataset documentation. (You can also check out a summary of the dataset using `describe()` or `dtypes`, the pandas counterparts of R's summary() and str().)
3. Pick a variable to predict and one variable to use to predict it
    * For this challenge, if you're picking a categorical value, I'd recommend choosing one with only two possible categories (like dessert or not dessert)
4. Plot your two variables
5. Use "geom_smooth" and the appropriate family to fit and plot a model
6. Optional: If you want to share your analysis with friends or to ask for help, you’ll need to make it public so that other people can see it.
  * Publish your kernel by hitting the big blue “publish” button. (This may take a second.)
  * Change the visibility to “public” by clicking on the blue “Make Public” text (right above the “Fork Notebook” button).
  * Tag your notebook with 5daychallenge

In [None]:
# Summary statistics for the weather data.
weather.describe()

In [None]:
# Preview the first few rows with the notebook's rich table display instead of
# printing the entire frame as plain text.
weather.head()

In [None]:
# List the column names available in the weather data.
weather.columns

In [None]:
# Data type of each column, useful for telling numeric columns from text ones.
weather.dtypes

In [None]:
# Number of rows for each distinct value of 'Precip Type'.
weather['Precip Type'].value_counts()

In [None]:
# Mean apparent temperature (C) within each 'Precip Type' group.
weather.groupby('Precip Type')['Apparent Temperature (C)'].mean()

In [None]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection. Fall back to the
# old location only on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Split into train and test data: predict 'Apparent Temperature (C)' from the
# remaining columns, leaving out the target itself and the free-text/date columns.
# 25% of the rows are held out for testing; random_state fixes the shuffle so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    weather.drop(['Apparent Temperature (C)', 'Summary', 'Daily Summary', 'Formatted Date'], axis=1),
    weather['Apparent Temperature (C)'],
    test_size=0.25,
    random_state=42,
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)