In [21]:
import pandas as pd
import glob
import os
import datetime

In [4]:
dir_2015 = '2015-citibike-tripdata'

In [5]:
csv_files = glob.glob(os.path.join(dir_2015, '**', '*.csv'), recursive=True)

In [6]:
# Initialize an empty list to store DataFrames
dfs = []

# Loop through the list of CSV files and read each one into a DataFrame
for csv_file in csv_files:
    df = pd.read_csv(csv_file)
    df_sample = df.sample(100)
    dfs.append(df_sample)

In [7]:

# Concatenate all DataFrames into a single DataFrame
combined_15_df = pd.concat(dfs, ignore_index=True)

# Optionally, save the combined DataFrame to a new CSV file
combined_15_df.to_csv('combined_2015_data.csv', index=False)

# Display the combined DataFrame info
print(combined_15_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   tripduration             1200 non-null   int64  
 1   starttime                1200 non-null   object 
 2   stoptime                 1200 non-null   object 
 3   start station id         1200 non-null   int64  
 4   start station name       1200 non-null   object 
 5   start station latitude   1200 non-null   float64
 6   start station longitude  1200 non-null   float64
 7   end station id           1200 non-null   int64  
 8   end station name         1200 non-null   object 
 9   end station latitude     1200 non-null   float64
 10  end station longitude    1200 non-null   float64
 11  bikeid                   1200 non-null   int64  
 12  usertype                 1200 non-null   object 
 13  birth year               1084 non-null   float64
 14  gender                  

In [8]:
combined_15_df.tail()

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
1195,609,5/9/2015 11:05:20,5/9/2015 11:15:30,530,11 Ave & W 59 St,40.771522,-73.990541,2006,Central Park S & 6 Ave,40.765909,-73.976342,18122,Subscriber,1962.0,2
1196,557,5/22/2015 07:47:11,5/22/2015 07:56:29,254,W 11 St & 6 Ave,40.735324,-73.998004,545,E 23 St & 1 Ave,40.736502,-73.978095,19209,Subscriber,1987.0,1
1197,283,5/5/2015 16:16:26,5/5/2015 16:21:09,345,W 13 St & 6 Ave,40.736494,-73.997044,336,Sullivan St & Washington Sq,40.730477,-73.999061,15236,Subscriber,1994.0,1
1198,1263,5/15/2015 17:26:58,5/15/2015 17:48:01,238,Bank St & Washington St,40.736197,-74.008592,450,W 49 St & 8 Ave,40.762272,-73.987882,18610,Subscriber,1985.0,1
1199,490,5/29/2015 20:05:49,5/29/2015 20:13:59,482,W 15 St & 7 Ave,40.739355,-73.999318,521,8 Ave & W 31 St,40.750967,-73.994442,18592,Subscriber,1956.0,1


In [9]:
combined_15_df.isnull().sum()

tripduration                 0
starttime                    0
stoptime                     0
start station id             0
start station name           0
start station latitude       0
start station longitude      0
end station id               0
end station name             0
end station latitude         0
end station longitude        0
bikeid                       0
usertype                     0
birth year                 116
gender                       0
dtype: int64

In [10]:
# Convert 'birth year' column to numeric, forcing non-numeric entries to NaN
combined_15_df['birth year'] = pd.to_numeric(combined_15_df['birth year'], errors='coerce')

# Fill NaN values with the mean of the column
median_birth_year = combined_15_df['birth year'].median()
combined_15_df['birth year'] = combined_15_df['birth year'].fillna(median_birth_year)

# Convert 'birth year' to integer type
combined_15_df['birth year'] = combined_15_df['birth year'].astype(int)

In [11]:
combined_15_df.isnull().sum()

tripduration               0
starttime                  0
stoptime                   0
start station id           0
start station name         0
start station latitude     0
start station longitude    0
end station id             0
end station name           0
end station latitude       0
end station longitude      0
bikeid                     0
usertype                   0
birth year                 0
gender                     0
dtype: int64

In [12]:
combined_15_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   tripduration             1200 non-null   int64  
 1   starttime                1200 non-null   object 
 2   stoptime                 1200 non-null   object 
 3   start station id         1200 non-null   int64  
 4   start station name       1200 non-null   object 
 5   start station latitude   1200 non-null   float64
 6   start station longitude  1200 non-null   float64
 7   end station id           1200 non-null   int64  
 8   end station name         1200 non-null   object 
 9   end station latitude     1200 non-null   float64
 10  end station longitude    1200 non-null   float64
 11  bikeid                   1200 non-null   int64  
 12  usertype                 1200 non-null   object 
 13  birth year               1200 non-null   int64  
 14  gender                  

In [22]:
combined_15_df.to_csv('combined_2015n_data.csv')

In [4]:
combined_15n_df = pd.read_csv('combined_2015n_data.csv')

In [5]:

combined_15n_df.head()

Unnamed: 0.1,Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,0,387,2015/04/22 7:20:24 PM,2015/04/22 7:26:51 PM,453,W 22 St & 8 Ave,40.744751,-73.999154,459,W 20 St & 11 Ave,40.746745,-74.007756,19905,Subscriber,1988,1
1,1,461,2015/04/12 5:08:13 PM,2015/04/12 5:15:54 PM,402,Broadway & E 22 St,40.740343,-73.989551,438,St Marks Pl & 1 Ave,40.727791,-73.985649,16623,Subscriber,1969,1
2,2,608,2015/04/30 12:13:22 PM,2015/04/30 12:23:30 PM,250,Lafayette St & Jersey St,40.724561,-73.995653,334,W 20 St & 7 Ave,40.742388,-73.997262,19587,Subscriber,1984,1
3,3,444,2015/04/14 4:37:46 PM,2015/04/14 4:45:10 PM,2023,E 55 St & Lexington Ave,40.759681,-73.970314,518,E 39 St & 2 Ave,40.747804,-73.973442,15512,Subscriber,1970,1
4,4,276,2015/04/16 2:58:39 PM,2015/04/16 3:03:16 PM,489,10 Ave & W 28 St,40.750664,-74.001768,509,9 Ave & W 22 St,40.745497,-74.001971,15636,Subscriber,1971,2


In [7]:
combined_15n_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Unnamed: 0               1200 non-null   int64  
 1   tripduration             1200 non-null   int64  
 2   starttime                1200 non-null   object 
 3   stoptime                 1200 non-null   object 
 4   start station id         1200 non-null   int64  
 5   start station name       1200 non-null   object 
 6   start station latitude   1200 non-null   float64
 7   start station longitude  1200 non-null   float64
 8   end station id           1200 non-null   int64  
 9   end station name         1200 non-null   object 
 10  end station latitude     1200 non-null   float64
 11  end station longitude    1200 non-null   float64
 12  bikeid                   1200 non-null   int64  
 13  usertype                 1200 non-null   object 
 14  birth year              

In [8]:

# # Convert 'starttime' and 'stoptime' to datetime
combined_15n_df['starttime'] = pd.to_datetime(combined_15n_df['starttime'])
combined_15n_df['stoptime'] = pd.to_datetime(combined_15n_df['stoptime'])


# Convert 'gender' to categorical if necessary
combined_15n_df['gender'] = combined_15n_df['gender'].astype('category')


  combined_15n_df['starttime'] = pd.to_datetime(combined_15n_df['starttime'])
  combined_15n_df['stoptime'] = pd.to_datetime(combined_15n_df['stoptime'])


In [9]:
combined_15n_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   Unnamed: 0               1200 non-null   int64         
 1   tripduration             1200 non-null   int64         
 2   starttime                1200 non-null   datetime64[ns]
 3   stoptime                 1200 non-null   datetime64[ns]
 4   start station id         1200 non-null   int64         
 5   start station name       1200 non-null   object        
 6   start station latitude   1200 non-null   float64       
 7   start station longitude  1200 non-null   float64       
 8   end station id           1200 non-null   int64         
 9   end station name         1200 non-null   object        
 10  end station latitude     1200 non-null   float64       
 11  end station longitude    1200 non-null   float64       
 12  bikeid                   1200 non-

In [10]:
combined_15n_df.tail()

Unnamed: 0.1,Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
1195,1195,609,2015-05-09 11:05:20,2015-05-09 11:15:30,530,11 Ave & W 59 St,40.771522,-73.990541,2006,Central Park S & 6 Ave,40.765909,-73.976342,18122,Subscriber,1962,2
1196,1196,557,2015-05-22 07:47:11,2015-05-22 07:56:29,254,W 11 St & 6 Ave,40.735324,-73.998004,545,E 23 St & 1 Ave,40.736502,-73.978095,19209,Subscriber,1987,1
1197,1197,283,2015-05-05 16:16:26,2015-05-05 16:21:09,345,W 13 St & 6 Ave,40.736494,-73.997044,336,Sullivan St & Washington Sq,40.730477,-73.999061,15236,Subscriber,1994,1
1198,1198,1263,2015-05-15 17:26:58,2015-05-15 17:48:01,238,Bank St & Washington St,40.736197,-74.008592,450,W 49 St & 8 Ave,40.762272,-73.987882,18610,Subscriber,1985,1
1199,1199,490,2015-05-29 20:05:49,2015-05-29 20:13:59,482,W 15 St & 7 Ave,40.739355,-73.999318,521,8 Ave & W 31 St,40.750967,-73.994442,18592,Subscriber,1956,1


In [11]:
combined_15n_df.drop(columns=['Unnamed: 0'])

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,387,2015-04-22 19:20:24,2015-04-22 19:26:51,453,W 22 St & 8 Ave,40.744751,-73.999154,459,W 20 St & 11 Ave,40.746745,-74.007756,19905,Subscriber,1988,1
1,461,2015-04-12 17:08:13,2015-04-12 17:15:54,402,Broadway & E 22 St,40.740343,-73.989551,438,St Marks Pl & 1 Ave,40.727791,-73.985649,16623,Subscriber,1969,1
2,608,2015-04-30 12:13:22,2015-04-30 12:23:30,250,Lafayette St & Jersey St,40.724561,-73.995653,334,W 20 St & 7 Ave,40.742388,-73.997262,19587,Subscriber,1984,1
3,444,2015-04-14 16:37:46,2015-04-14 16:45:10,2023,E 55 St & Lexington Ave,40.759681,-73.970314,518,E 39 St & 2 Ave,40.747804,-73.973442,15512,Subscriber,1970,1
4,276,2015-04-16 14:58:39,2015-04-16 15:03:16,489,10 Ave & W 28 St,40.750664,-74.001768,509,9 Ave & W 22 St,40.745497,-74.001971,15636,Subscriber,1971,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,609,2015-05-09 11:05:20,2015-05-09 11:15:30,530,11 Ave & W 59 St,40.771522,-73.990541,2006,Central Park S & 6 Ave,40.765909,-73.976342,18122,Subscriber,1962,2
1196,557,2015-05-22 07:47:11,2015-05-22 07:56:29,254,W 11 St & 6 Ave,40.735324,-73.998004,545,E 23 St & 1 Ave,40.736502,-73.978095,19209,Subscriber,1987,1
1197,283,2015-05-05 16:16:26,2015-05-05 16:21:09,345,W 13 St & 6 Ave,40.736494,-73.997044,336,Sullivan St & Washington Sq,40.730477,-73.999061,15236,Subscriber,1994,1
1198,1263,2015-05-15 17:26:58,2015-05-15 17:48:01,238,Bank St & Washington St,40.736197,-74.008592,450,W 49 St & 8 Ave,40.762272,-73.987882,18610,Subscriber,1985,1


In [16]:
# Ensure 'starttime' and 'stoptime' columns are strings (if not already)
combined_15n_df['starttime'] = combined_15n_df['starttime'].astype(str)
combined_15n_df['stoptime'] = combined_15n_df['stoptime'].astype(str)

# Split 'starttime' into 'start_date' and 'start_time'
combined_15n_df[['start_date', 'start_time']] = combined_15n_df['starttime'].str.split(' ', expand=True)

# Split 'stoptime' into 'stop_date' and 'stop_time'
combined_15n_df[['stop_date', 'stop_time']] = combined_15n_df['stoptime'].str.split(' ', expand=True)

# Convert 'start_date' and 'stop_date' columns to datetime.date
combined_15n_df['start_date'] = pd.to_datetime(combined_15n_df['start_date']).dt.date
combined_15n_df['stop_date'] = pd.to_datetime(combined_15n_df['stop_date']).dt.date

# Convert 'start_time' and 'stop_time' columns to datetime.time
combined_15n_df['start_time'] = pd.to_datetime(combined_15n_df['start_time'], format='%H:%M:%S').dt.time
combined_15n_df['stop_time'] = pd.to_datetime(combined_15n_df['stop_time'], format='%H:%M:%S').dt.time

# Drop the original 'starttime' and 'stoptime' columns
combined_15n_df = combined_15n_df.drop(columns=['starttime', 'stoptime'])

# Check the result
print(combined_15n_df.dtypes)

Unnamed: 0                    int64
tripduration                  int64
start station id              int64
start station name           object
start station latitude      float64
start station longitude     float64
end station id                int64
end station name             object
end station latitude        float64
end station longitude       float64
bikeid                        int64
usertype                     object
birth year                    int64
gender                     category
start_date                   object
start_time                   object
stop_date                    object
stop_time                    object
dtype: object


In [23]:
# Convert 'start_date' and 'stop_date' columns to datetime
combined_15n_df['start_date'] = pd.to_datetime(combined_15n_df['start_date'], format='%Y-%m-%d')
combined_15n_df['stop_date'] = pd.to_datetime(combined_15n_df['stop_date'], format='%Y-%m-%d')

# Convert 'start_time' and 'stop_time' by combining with the date for full datetime conversion, then extract only the time part
combined_15n_df['start_time'] = pd.to_datetime(combined_15n_df['start_time'], format='%H:%M:%S').dt.time
combined_15n_df['stop_time'] = pd.to_datetime(combined_15n_df['stop_time'], format='%H:%M:%S').dt.time

# Check the datatypes to verify conversion
print(combined_15n_df.dtypes)


Unnamed: 0                          int64
tripduration                        int64
start station id                    int64
start station name                 object
start station latitude            float64
start station longitude           float64
end station id                      int64
end station name                   object
end station latitude              float64
end station longitude             float64
bikeid                              int64
usertype                           object
birth year                          int64
gender                           category
start_date                 datetime64[ns]
start_time                         object
stop_date                  datetime64[ns]
stop_time                          object
dtype: object


In [24]:
#  Save to CSV or Excel if needed for Tableau
combined_15n_df.to_csv('processed_2015_data.csv', index=False)