In [1]:
pip install -i https://test.pypi.org/simple/ lambdata-ryan==1.8.1

Looking in indexes: https://test.pypi.org/simple/
Collecting lambdata-ryan==1.8.1
  Downloading https://test-files.pythonhosted.org/packages/d6/6f/d807a77f4b4afb517ba26f299124626d530cb870f1e5dee593414160b1fa/lambdata_ryan-1.8.1-py3-none-any.whl (4.1 kB)
Installing collected packages: lambdata-ryan
Successfully installed lambdata-ryan-1.8.1
Note: you may need to restart the kernel to use updated packages.


In [10]:
from my_lambdata.my_script import start
import pandas as pd
import numpy as np

In [3]:
# Run the packaged setup helper.
# NOTE(review): presumably this configures pandas display options (the next
# cell reads display.max_rows back as 14) — confirm in my_lambdata.my_script.
start()

In [4]:
# Read back the current row-display limit; the recorded output is 14.
pd.get_option('display.max_rows')

14

In [5]:
# Load six columns of the UCI abalone data set. Because `names` is supplied,
# pandas treats the first line of the file as data rather than as a header.
# The two string literals are joined by implicit concatenation; the stray
# REPL continuation prompt ("...") that used to sit inside the parentheses
# has been removed so the cell is valid plain Python as well as IPython.
url = ('https://archive.ics.uci.edu/ml/'
       'machine-learning-databases/abalone/abalone.data')
cols = ['sex', 'length', 'diam', 'height', 'weight', 'rings']
abalone = pd.read_csv(url, usecols=[0, 1, 2, 3, 4, 8], names=cols)

In [6]:
# Test the first function: display the frame and check that the rendered
# table is truncated to the 14-row display limit instead of showing every row.
abalone

Unnamed: 0,sex,length,diam,height,weight,rings
0,M,0.455,0.365,0.095,0.5140,15
1,M,0.350,0.265,0.090,0.2255,7
2,F,0.530,0.420,0.135,0.6770,9
3,M,0.440,0.365,0.125,0.5160,10
4,I,0.330,0.255,0.080,0.2050,7
...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,11
4173,M,0.590,0.440,0.135,0.9660,10
4174,M,0.600,0.475,0.205,1.1760,9
4175,F,0.625,0.485,0.150,1.0945,10


In [7]:
from my_lambdata.my_script import train_split

# Base URL of the course data directory; relative file paths are appended to it.
DATA_PATH = 'https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Kaggle-Challenge/master/data/'

In [8]:
# Download the NYC RentHop apartment listings from the course data directory.
df = pd.read_csv(DATA_PATH+'apartments/renthop-nyc.csv')
# Fail fast if the upstream file no longer has the expected rows/columns.
assert df.shape == (49352, 34)

In [11]:
# Remove the most extreme 1% prices,
# the most extreme .1% latitudes, &
# the most extreme .1% longitudes.
# All six percentile bounds are computed on the untrimmed frame, then the
# combined boolean mask is applied in a single step.
# NOTE(review): the upper latitude bound is strict (<) while the other five
# bounds are inclusive — confirm whether that asymmetry is intended. It is
# kept as-is here so the resulting row counts do not change.
df = df[(df['price'] >= np.percentile(df['price'], 0.5)) &
        (df['price'] <= np.percentile(df['price'], 99.5)) &
        (df['latitude'] >= np.percentile(df['latitude'], 0.05)) &
        (df['latitude'] < np.percentile(df['latitude'], 99.95)) &
        (df['longitude'] >= np.percentile(df['longitude'], 0.05)) &
        (df['longitude'] <= np.percentile(df['longitude'], 99.95))]

# Do train/test split
# Use data from April & May 2016 to train
# Use data from June 2016 to test
# `infer_datetime_format` is deprecated since pandas 2.0 (format inference is
# now the default behaviour), so it is no longer passed.
df['created'] = pd.to_datetime(df['created'])
cutoff = pd.to_datetime('2016-06-01')
train = df[df.created < cutoff]
test = df[df.created >= cutoff]

# Wrangle train & test sets in the same way
def engineer_features(df):
    """Return a copy of ``df`` with derived listing features added.

    The input frame is not modified. The returned frame has a whitespace-
    stripped, NaN-free ``description`` column, gains the columns
    ``has_description``, ``description_length``, ``perk_count``,
    ``cats_or_dogs``, ``cats_and_dogs``, ``rooms`` and ``days`` (in that
    order), and no longer contains ``created``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``description``, ``bedrooms``, ``bathrooms``, a datetime
        ``created`` column, and every amenity flag column summed below.

    Returns
    -------
    pandas.DataFrame
        A new frame with the engineered columns.
    """
    # Amenity flag columns that are summed row-wise into `perk_count`.
    amenity_flags = [
        'elevator', 'cats_allowed', 'hardwood_floors', 'dogs_allowed',
        'doorman', 'dishwasher', 'no_fee', 'laundry_in_building',
        'fitness_center', 'pre-war', 'laundry_in_unit', 'roof_deck',
        'outdoor_space', 'dining_room', 'high_speed_internet', 'balcony',
        'swimming_pool', 'new_construction', 'exclusive', 'terrace',
        'loft', 'garden_patio', 'common_outdoor_space',
        'wheelchair_access',
    ]

    # Work on a private copy so the caller's frame is untouched and no
    # SettingWithCopyWarning is raised.
    out = df.copy()

    # Description text: strip whitespace, treat missing as empty, then
    # derive a presence flag and a character count from the cleaned text.
    text = out['description'].str.strip().fillna('')
    out['description'] = text
    out['has_description'] = text.ne('')
    out['description_length'] = text.str.len()

    # Total number of perks per listing.
    out['perk_count'] = out[amenity_flags].sum(axis=1)

    # Pet policy: either animal allowed / both animals allowed.
    cats_ok = out['cats_allowed'].eq(1)
    dogs_ok = out['dogs_allowed'].eq(1)
    out['cats_or_dogs'] = cats_ok | dogs_ok
    out['cats_and_dogs'] = cats_ok & dogs_ok

    # Bedrooms plus bathrooms.
    out['rooms'] = out['bedrooms'] + out['bathrooms']

    # Whole days elapsed since 2016-01-01; the raw timestamp is then dropped.
    year_start = pd.to_datetime('2016-01-01')
    out['days'] = (out['created'] - year_start).dt.days
    return out.drop(columns='created')

In [12]:
# Exercise the packaged split helper on the trimmed frame. The recorded
# output is a tuple whose first element is a DataFrame.
# NOTE(review): the result is only displayed, never assigned, so the full
# frames are dumped into the output — consider capturing it and showing
# shapes or .head() instead.
train_split(df)

(       bathrooms  bedrooms             created               description    display_address  latitude  longitude  price         street_address interest_level  elevator  cats_allowed  hardwood_floors  dogs_allowed  doorman  dishwasher  no_fee  laundry_in_building  fitness_center  pre-war  laundry_in_unit  roof_deck  outdoor_space  dining_room  high_speed_internet  balcony  swimming_pool  new_construction  terrace  exclusive  loft  garden_patio  wheelchair_access  common_outdoor_space
 28645        1.0         1 2016-05-28 02:42:48  The UnitThis is a sou...   West 54th Street   40.7678   -73.9915   4697   550 West 54th Street            low         1             1                0             1        1           0       0                    0               0        0                1          0              0            0                    0        0              0                 0        0          0     0             0                  0                     0
 14640        1.0     

In [13]:
# Row/column counts of the date-based split (created before vs. on/after 2016-06-01).
train.shape, test.shape

((31844, 34), (16973, 34))