In [85]:
## Build a custom Estimator ###
# Given a city and a timestamp, returns that city's temperature for the same month/day/hour, averaged across years

import os

import pandas as pd
from sklearn import base

# Mixins are listed before BaseEstimator, the order scikit-learn's developer
# guide recommends, so the mixin hooks are not masked in the MRO.
class TemperatureEstimator(base.RegressorMixin, base.TransformerMixin, base.BaseEstimator):
    """
    Climatological-mean temperature model.

    For every (city, month-day-hour) pair seen during training the model keeps
    the mean of the observed temperatures across years; predict() looks that
    mean up again.

    Typical flow: load() -> transform() -> fit() -> predict().

    Attributes
    ---
    ix, multiIndex: the ('city', 'mdh') MultiIndex built by the last transform()
    model: DataFrame with columns ['mdh', 'avg_temp', 'city'], set by fit()
    pred: the list returned by the last predict() call
    """
#     __module__ = os.path.splitext(os.path.basename(__file__))[0]  ### look here ###

    ## CONSTANTS ###
    # `global` inside a class body binds these names at module level (they are
    # NOT class attributes); kept that way so code that reads the bare names
    # PATH_DATA / PATH_PKL keeps working.
    global PATH_DATA
    global PATH_PKL

    PATH_PKL = '/home/vagrant/miniprojects/questions/3_week/ts/pickles/'
    # Alternate (VM) data location: '/home/vagrant/miniprojects/questions/3_week/ts/data/'
    PATH_DATA = '~/Dev/Data_Incubator/miniprojects/questions/3_week/ts/data/'

    # File read by load() when no path is given.
    DEFAULT_TRAIN_FILE = 'tiny_train2.txt'
    # Sentinel used in the raw data for a missing temperature reading.
    MISSING_TEMP = -9999
    # Column names of the raw whitespace-separated file, in file order.
    RAW_COLUMNS = ['year', 'month', 'day', 'hour', 'temp', 'dew_temp', 'pressure',
                   'wind_angle', 'wind_speed', 'sky_code', 'rain_hour', 'rain_6hour', 'city']
    # Subset of RAW_COLUMNS that load() returns.
    KEPT_COLUMNS = ['year', 'month', 'day', 'hour', 'temp', 'city']

    def __init__(self):
        self.ix = None  # ('city', 'mdh') MultiIndex, set during transform
        self.multiIndex = None  # same object as self.ix (historical attribute name)
        self.model = None  # the trained model, set during fit
        self.pred = None  # most recent predictions, set during predict

    def _city_mdh_frame(self, X):
        """
        Returns: a new DataFrame with a fresh 0..n-1 index and exactly the
        columns ['city', 'mdh'], one row per row of X

        The keys are taken, in order of preference, from X's 'city'/'mdh'
        columns, from index levels with those names, or from a 'city' column
        plus raw 'month'/'day'/'hour' columns.

        Raises: KeyError if X offers none of those.
        """
        keys = ['city', 'mdh']
        if all(name in X.columns for name in keys):
            # Dropping the index avoids pandas' "both an index level and a
            # column label" ambiguity for frames produced by transform().
            return X.loc[:, keys].reset_index(drop=True)

        if all(name in X.index.names for name in keys):
            levels = dict((name, X.index.get_level_values(name)) for name in keys)
            return pd.DataFrame(levels, columns=keys)

        parts = ['month', 'day', 'hour']
        if 'city' in X.columns and all(name in X.columns for name in parts):
            out = X.loc[:, ['city']].reset_index(drop=True)
            padded = [X[name].astype(str).str.zfill(2).values for name in parts]
            out['mdh'] = padded[0] + padded[1] + padded[2]
            return out

        raise KeyError("X needs 'city' and 'mdh' (columns or index levels), "
                       "or 'city' plus 'month', 'day' and 'hour' columns")

    def load(self, file=None):
        """
        Loads a training set from a whitespace-separated text file.

        Returns: A pandas DataFrame with the columns in KEPT_COLUMNS

        Parameters
        ---
        file: path to a .txt file; defaults to PATH_DATA + DEFAULT_TRAIN_FILE
        """
        if file is None:
            file = PATH_DATA + self.DEFAULT_TRAIN_FILE
        if not hasattr(file, 'read'):
            # PATH_DATA starts with '~'; expand it explicitly for path inputs.
            file = os.path.expanduser(file)

        # r"\s+" (one or more blanks): the previous r"\s*" also matches the
        # empty string and is not a usable field separator.
        data = pd.read_csv(file, sep=r"\s+", names=list(self.RAW_COLUMNS),
                           header=None, encoding='latin-1')

        return data.loc[:, list(self.KEPT_COLUMNS)].copy()

    def fit(self, X, y):
        """
        Learns the mean temperature for every (city, mdh) combination.

        The model is kept in memory only (self.model); nothing is written to
        PATH_PKL. X and y are not modified.

        Returns: the trained model, a pandas df with columns
        ['mdh', 'avg_temp', 'city'] sorted by city, then mdh

        Parameters
        ---
        X: a pandas df carrying 'city' and 'mdh' as columns or index levels
           (e.g. the X returned by transform)
        y: a pandas Series, single-column df or sequence of temperatures,
           matched to the rows of X by position

        Raises
        ---
        ValueError: if y has more than one column or a different length than X
        """
        frame = self._city_mdh_frame(X)

        if isinstance(y, pd.DataFrame):
            if y.shape[1] != 1:
                raise ValueError('y must hold exactly one target column')
            y = y.iloc[:, 0]
        # Strip y's index so the match is positional, even when X and y share
        # a non-unique (city, mdh) index.
        targets = getattr(y, 'values', y)
        if len(targets) != len(frame):
            raise ValueError('X and y must have the same number of rows')
        frame['temp'] = targets

        averages = (frame.groupby(['city', 'mdh'])['temp']
                         .mean()
                         .reset_index()
                         .rename(columns={'temp': 'avg_temp'}))
        model = averages.loc[:, ['mdh', 'avg_temp', 'city']]

        self.model = model
        ## At this point, model is trained.  It's a df of mdh, avg_temp, city
        return model

    def fit_linear(self, X, y):
        """
        Placeholder, not implemented (presumably meant for a linear-trend fit).

        Returns: None

        Parameters
        ---
        X: unused
        y: unused
        """
        return

    def fit_fourier(self, X, y):
        """
        Placeholder, not implemented (presumably meant for a Fourier-series fit).

        Returns: None

        Parameters
        ---
        X: unused
        y: unused
        """
        return

    def transform(self, record, features=None, target=None):
        """
        Cleans raw records and indexes them by ('city', 'mdh').

        Steps: zero-pad month/day/hour, build a 'timestamp' and an 'mdh'
        (month+day+hour string) column, replace MISSING_TEMP readings by linear
        interpolation between neighbouring rows of the same city (edge gaps
        take the nearest valid reading), then attach the MultiIndex. `record`
        itself is left untouched.

        Returns: A tuple of pandas objects (X, y)

        Parameters
        ---
        record: a pandas dataframe with 'year', 'month', 'day', 'hour', 'temp'
                and 'city' columns, rows in chronological order per city
        features: a list of feature names (X); defaults to every cleaned
                  column except the target ('timestamp', 'city', 'mdh')
        target: the name of the target feature (y); defaults to 'temp'
        """
        # reset_index returns a copy, so the caller's frame is never modified.
        frame = record.reset_index(drop=True)

        ## Convert date nums to zero-padded strings ###
        for part in ('year', 'month', 'day', 'hour'):
            frame[part] = frame[part].astype(str)
        for part in ('month', 'day', 'hour'):
            frame[part] = frame[part].str.zfill(2)

        # join dates without years for later grouping
        frame['mdh'] = frame['month'] + frame['day'] + frame['hour']

        ## Cast to timestamp ###
        # An explicit format keeps the parse unambiguous.
        frame['timestamp'] = pd.to_datetime(frame['year'] + frame['mdh'], format='%Y%m%d%H')

        ## Subset df ###
        frame = frame.loc[:, ['timestamp', 'temp', 'city', 'mdh']].copy()

        ## Clean NaNs ###
        # temp == MISSING_TEMP is equivalent to NaN. An isolated gap becomes
        # the average of its two neighbours.
        temps = frame['temp'].where(frame['temp'] != self.MISSING_TEMP)
        frame['temp'] = temps.groupby(frame['city']).transform(
            lambda series: series.interpolate(limit_direction='both'))

        ## Set MultiIndex: ('city', 'mdh') ###
        ix = pd.MultiIndex.from_arrays([frame['city'], frame['mdh']], names=['city', 'mdh'])
        frame = frame.set_index(ix)
        self.ix = ix
        self.multiIndex = ix

        # Split into X and y
        if target is None:
            target = 'temp'
        if features is None:
            excluded = target if isinstance(target, (list, tuple)) else [target]
            features = [name for name in frame.columns if name not in excluded]
        X = frame.loc[:, features]
        y = frame.loc[:, target]

        return (X, y)

    def train_test_split(self, X, y, ratio):
        """
        Divides X and y into sequential (unshuffled) training and test sets:
        the first int(ratio * len(X)) rows train, the remaining rows test.

        Returns: the tuple (X_trn, y_trn, X_tst, y_tst)

        Parameters
        ---
        X: features; a pandas object or any sliceable sequence
        y: targets; same length as X
        ratio: The fraction (0 to 1.0) of the set to use for training

        Raises
        ---
        ValueError: if ratio is outside [0, 1] or X and y differ in length
        """
        if not 0 <= ratio <= 1:
            raise ValueError('ratio must be between 0 and 1.0')
        if len(X) != len(y):
            raise ValueError('X and y must have the same number of rows')

        cut = int(ratio * len(X))

        def rows(obj, start, stop):
            # .iloc slices rows by position; plain obj[...] on a DataFrame
            # would select columns instead.
            if hasattr(obj, 'iloc'):
                return obj.iloc[start:stop]
            return obj[start:stop]

        X_trn = rows(X, 0, cut)
        y_trn = rows(y, 0, cut)
        X_tst = rows(X, cut, None)
        y_tst = rows(y, cut, None)

        return (X_trn, y_trn, X_tst, y_tst)

    def fit_transform(self, X, y):
        """
        Placeholder, not implemented.

        NOTE: this override shadows TransformerMixin.fit_transform.

        Returns: None

        Parameters
        ---
        X: unused
        y: unused
        """
        return

    def predict(self, X):
        """
        Looks up the learned mean temperature for each row of X.

        Returns: a list of y predictions, one per row of X and in the same
        order; NaN where the (city, mdh) pair was not seen by fit(). The
        list is also stored in self.pred.

        Parameters
        ---
        X: a pandas df of features carrying 'city' and 'mdh' (columns or index
           levels), or 'city' plus raw 'month', 'day', 'hour' columns

        Raises
        ---
        ValueError: if fit() has not been called yet
        """
        model = getattr(self, 'model', None)
        if model is None:
            raise ValueError('TemperatureEstimator is not fitted yet; call fit() first')

        wanted = self._city_mdh_frame(X)
        # A left merge keeps one output row per requested row, in request
        # order; fit() guarantees each (city, mdh) occurs once in the model.
        matched = wanted.merge(model, how='left', on=['city', 'mdh'])

        self.pred = matched['avg_temp'].tolist()
        return self.pred

    def predict_fourier(self, X, y):
        """
        Placeholder, not implemented.

        Returns: None

        Parameters
        ---
        X: unused
        y: unused
        """
        return

    def score(self, X, y):
        """
        Placeholder, not implemented.

        NOTE: this override shadows RegressorMixin.score, so callers get None
        rather than a score until it is implemented.

        Returns: None

        Parameters
        ---
        X: unused
        y: unused
        """
        return

In [42]:
import pandas
# Scratch check: what dtype do values have when a mixed int/float frame is
# walked row by row?
df = pandas.DataFrame([[1, 1.0], [2, 3], [4, 5]], columns=['x', 'y'])

# Single-argument print(...) calls emit the same text on Python 2 and 3;
# the old `print a, b` statement form is a SyntaxError on Python 3.
print(str(df) + ' \n-------------------')

# iterrows() yields (index label, row Series) pairs. Each row is one Series,
# so the integer column 'x' comes back as floats (1.0, 2.0, 4.0).
for _, row in df.iterrows():
    print(row['x'])

# A scalar taken from such a row is a numpy.float64.
type(list(df.iterrows())[1][1]['y'])


   x  y
0  1  1
1  2  3
2  4  5 
-------------------
1.0
2.0
4.0


numpy.float64

In [63]:
import pandas as pd
import numpy as np
# Demo frame: 8 rows x 4 columns of standard-normal noise, plus two label
# columns that together identify each row (used for the MultiIndex below).
outer_labels = ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']
inner_labels = ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']

df = pd.DataFrame(np.random.randn(8, 4)).assign(first=outer_labels, second=inner_labels)
df

Unnamed: 0,0,1,2,3,first,second
0,-1.030953,1.171499,-0.140037,-1.898345,bar,one
1,-0.011493,-0.690017,0.294604,-2.224048,bar,two
2,-0.859591,0.391188,1.270933,1.161132,baz,one
3,1.779973,-0.881454,-0.861854,1.922961,baz,two
4,-0.482403,-1.088713,1.275432,0.269679,foo,one
5,-0.301737,1.365313,-1.732238,1.07783,foo,two
6,-0.824878,-1.136114,0.448508,0.396625,qux,one
7,1.054949,0.387711,-0.082592,-2.550659,qux,two


In [73]:

# Build a two-level index from the label columns. set_index returns a new
# frame (displayed here); df itself is not reassigned.
level_names = ['first', 'second']
ix = pd.MultiIndex.from_arrays([df[level] for level in level_names], names=level_names)
df.set_index(ix)

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,first,second
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bar,one,-1.030953,1.171499,-0.140037,-1.898345,bar,one
bar,two,-0.011493,-0.690017,0.294604,-2.224048,bar,two
baz,one,-0.859591,0.391188,1.270933,1.161132,baz,one
baz,two,1.779973,-0.881454,-0.861854,1.922961,baz,two
foo,one,-0.482403,-1.088713,1.275432,0.269679,foo,one
foo,two,-0.301737,1.365313,-1.732238,1.07783,foo,two
qux,one,-0.824878,-1.136114,0.448508,0.396625,qux,one
qux,two,1.054949,0.387711,-0.082592,-2.550659,qux,two


In [80]:
# Positional slice: the first eight rows with every column.
df.iloc[:8]

Unnamed: 0,0,1,2,3,first,second
0,-1.030953,1.171499,-0.140037,-1.898345,bar,one
1,-0.011493,-0.690017,0.294604,-2.224048,bar,two
2,-0.859591,0.391188,1.270933,1.161132,baz,one
3,1.779973,-0.881454,-0.861854,1.922961,baz,two
4,-0.482403,-1.088713,1.275432,0.269679,foo,one
5,-0.301737,1.365313,-1.732238,1.07783,foo,two
6,-0.824878,-1.136114,0.448508,0.396625,qux,one
7,1.054949,0.387711,-0.082592,-2.550659,qux,two
