In [1]:
import pandas as pd
import os

# Directory containing the data files
data_directory = '/Users/sirvartsarafian/Bikes/Data/All Data'

# List all files in the directory ending with '.csv'
data_files = [file for file in os.listdir(data_directory) if file.endswith('.csv')]

# Create empty lists to store NaN counts, DataFrames for each CSV file, and datetime conversion results
NaN_counts_per_file = []
dfs = []
datetime_conversion_results = []

# Define common column names as they vary in CSVs
common_columns = ['Trip Duration', 'Start Time', 'Stop Time', 'Start Station ID', 
                  'Start Station Name', 'Start Station Latitude', 'Start Station Longitude', 
                  'End Station ID', 'End Station Name', 'End Station Latitude', 
                  'End Station Longitude', 'Bike ID', 'User Type', 'Birth Year', 'Gender']

# Loop through each data file and read it into a DataFrame
for data_file in data_files:
    # Read the CSV file into a DataFrame
    df = pd.read_csv(os.path.join(data_directory, data_file))
    
    # Rename columns to the common set of column names
    df.columns = common_columns
    
    # Convert 'Start Time' and 'Stop Time' columns to datetime format
    df['Start Time'] = pd.to_datetime(df['Start Time'], errors='coerce')
    df['Stop Time'] = pd.to_datetime(df['Stop Time'], errors='coerce')
    
    # Check if datetime conversion was successful
    conversion_result = df[['Start Time', 'Stop Time']].dtypes
    datetime_conversion_results.append(conversion_result)
    
    # Count NaN values in each column
    NaN_counts = df.isna().sum()
    NaN_counts_per_file.append(NaN_counts)
    
    # Append the DataFrame to the list
    dfs.append(df)

# Concatenate all DataFrames into a single DataFrame
data = pd.concat(dfs, ignore_index=True)

# Count NaN values in each column of the concatenated DataFrame
total_NaN_counts = data.isna().sum()

# Display NaN counts for each column in each file
for i, data_file in enumerate(data_files):
    print(f"NaN counts for {data_file}:")
    print(NaN_counts_per_file[i])
    print()

# Display datetime conversion results for each file
for i, data_file in enumerate(data_files):
    print(f"Datetime conversion results for {data_file}:")
    print(datetime_conversion_results[i])
    print()

# Display total NaN counts for each column in the concatenated DataFrame
print("Total NaN counts after concatenation:")
print(total_NaN_counts)

NaN counts for JC-201701-citibike-tripdata.csv:
Trip Duration                0
Start Time                   0
Stop Time                    0
Start Station ID             0
Start Station Name           0
Start Station Latitude       0
Start Station Longitude      0
End Station ID               0
End Station Name             0
End Station Latitude         0
End Station Longitude        0
Bike ID                      0
User Type                   18
Birth Year                 386
Gender                       0
dtype: int64

NaN counts for JC-201902-citibike-tripdata.csv:
Trip Duration              0
Start Time                 0
Stop Time                  0
Start Station ID           0
Start Station Name         0
Start Station Latitude     0
Start Station Longitude    0
End Station ID             0
End Station Name           0
End Station Latitude       0
End Station Longitude      0
Bike ID                    0
User Type                  0
Birth Year                 0
Gender             

In [2]:
# Add extra columns for Year, Month, Day of Week, Date, Day of Week Name and Hour data belongs so
data['Year'] = data['Start Time'].dt.year
data['Month'] = data['Start Time'].dt.month
data['Day_of_Week'] = data['Start Time'].dt.dayofweek
data['Date'] = data['Start Time'].dt.date
data['Day of Week Name'] = data['Start Time'].dt.strftime('%A')
data['Hour'] = data['Start Time'].dt.hour



In [3]:
NaNs_exist = data.isna().any()

# Displaying columns containing NaT values
if NaNs_exist.any():
    print("Columns with NaT values:")
    print(NaNs_exist[NaNs_exist])
else:
    print("No NaT values found in the DataFrame.")

Columns with NaT values:
User Type     True
Birth Year    True
dtype: bool


In [4]:
# Define the file to save CSV file
output_file = '/Users/sirvartsarafian/Bikes/Data/all_data.csv'  

# Save the DataFrame to a CSV file
data.to_csv(output_file, index=False)  

In [5]:
data.head()

Unnamed: 0,Trip Duration,Start Time,Stop Time,Start Station ID,Start Station Name,Start Station Latitude,Start Station Longitude,End Station ID,End Station Name,End Station Latitude,...,Bike ID,User Type,Birth Year,Gender,Year,Month,Day_of_Week,Date,Day of Week Name,Hour
0,148,2017-01-01 00:21:32,2017-01-01 00:24:01,3276,Marin Light Rail,40.714584,-74.042817,3185,City Hall,40.717732,...,24575,Subscriber,1983.0,1,2017,1,6,2017-01-01,Sunday,0
1,1283,2017-01-01 00:24:35,2017-01-01 00:45:58,3183,Exchange Place,40.716247,-74.033459,3198,Heights Elevator,40.748716,...,24723,Subscriber,1978.0,1,2017,1,6,2017-01-01,Sunday,0
2,372,2017-01-01 00:38:19,2017-01-01 00:44:31,3183,Exchange Place,40.716247,-74.033459,3211,Newark Ave,40.721525,...,24620,Subscriber,1989.0,1,2017,1,6,2017-01-01,Sunday,0
3,1513,2017-01-01 00:38:37,2017-01-01 01:03:50,3194,McGinley Square,40.72534,-74.067622,3271,Danforth Light Rail,40.69264,...,24668,Subscriber,1961.0,1,2017,1,6,2017-01-01,Sunday,0
4,639,2017-01-01 01:47:52,2017-01-01 01:58:31,3183,Exchange Place,40.716247,-74.033459,3203,Hamilton Park,40.727596,...,26167,Subscriber,1993.0,1,2017,1,6,2017-01-01,Sunday,1


In [6]:
data.tail()

Unnamed: 0,Trip Duration,Start Time,Stop Time,Start Station ID,Start Station Name,Start Station Latitude,Start Station Longitude,End Station ID,End Station Name,End Station Latitude,...,Bike ID,User Type,Birth Year,Gender,Year,Month,Day_of_Week,Date,Day of Week Name,Hour
1354229,934,2017-06-30 23:50:05,2017-07-01 00:05:40,3211,Newark Ave,40.721525,-74.046305,3207,Oakland Ave,40.737604,...,26287,Subscriber,1991.0,1,2017,6,4,2017-06-30,Friday,23
1354230,908,2017-06-30 23:50:31,2017-07-01 00:05:39,3211,Newark Ave,40.721525,-74.046305,3207,Oakland Ave,40.737604,...,29472,Subscriber,1993.0,2,2017,6,4,2017-06-30,Friday,23
1354231,238,2017-06-30 23:52:16,2017-06-30 23:56:15,3203,Hamilton Park,40.727596,-74.044247,3211,Newark Ave,40.721525,...,29572,Subscriber,1992.0,1,2017,6,4,2017-06-30,Friday,23
1354232,184,2017-06-30 23:57:50,2017-07-01 00:00:54,3267,Morris Canal,40.712419,-74.038526,3183,Exchange Place,40.716247,...,29199,Subscriber,1985.0,1,2017,6,4,2017-06-30,Friday,23
1354233,463,2017-06-30 23:59:16,2017-07-01 00:07:00,3202,Newport PATH,40.727224,-74.033759,3203,Hamilton Park,40.727596,...,26289,Subscriber,1965.0,1,2017,6,4,2017-06-30,Friday,23


In [15]:
data['Hour'] = data['Start Time'].dt.hour


In [16]:
data.head()

Unnamed: 0,Trip Duration,Start Time,Stop Time,Start Station ID,Start Station Name,Start Station Latitude,Start Station Longitude,End Station ID,End Station Name,End Station Latitude,...,Bike ID,User Type,Birth Year,Gender,Year,Month,Day_of_Week,Date,Day of Week Name,Hour
0,148,2017-01-01 00:21:32,2017-01-01 00:24:01,3276,Marin Light Rail,40.714584,-74.042817,3185,City Hall,40.717732,...,24575,Subscriber,1983.0,1,2017,1,6,2017-01-01,Sunday,0
1,1283,2017-01-01 00:24:35,2017-01-01 00:45:58,3183,Exchange Place,40.716247,-74.033459,3198,Heights Elevator,40.748716,...,24723,Subscriber,1978.0,1,2017,1,6,2017-01-01,Sunday,0
2,372,2017-01-01 00:38:19,2017-01-01 00:44:31,3183,Exchange Place,40.716247,-74.033459,3211,Newark Ave,40.721525,...,24620,Subscriber,1989.0,1,2017,1,6,2017-01-01,Sunday,0
3,1513,2017-01-01 00:38:37,2017-01-01 01:03:50,3194,McGinley Square,40.72534,-74.067622,3271,Danforth Light Rail,40.69264,...,24668,Subscriber,1961.0,1,2017,1,6,2017-01-01,Sunday,0
4,639,2017-01-01 01:47:52,2017-01-01 01:58:31,3183,Exchange Place,40.716247,-74.033459,3203,Hamilton Park,40.727596,...,26167,Subscriber,1993.0,1,2017,1,6,2017-01-01,Sunday,1
