# 每个州的处理数据

In [1]:
import numpy as np
import pandas as pd
import warnings
import json
warnings.filterwarnings('ignore')

## 矩形树状图

In [2]:
# 加载数据
df = pd.read_csv("E:/desktop/2023win/visualization/DATA/Car/US_Accidents_clean.csv")

In [3]:
# 将 "Start_Time" 列转换为 datetime 类型
df["Start_Time"] = pd.to_datetime(df["Start_Time"], format="%Y-%m-%d %H:%M:%S", errors='coerce', dayfirst=True)

# 将 "End_Time" 列转换为 datetime 类型
df["End_Time"] = pd.to_datetime(df["End_Time"], format="%Y-%m-%d %H:%M:%S", errors='coerce', dayfirst=True)

# 提取年、月、星期几、日期和小时信息
df["Year"] = df["Start_Time"].dt.year
df["Month"] = df["Start_Time"].dt.month
df["Weekday"] = df["Start_Time"].dt.weekday
df["Day"] = df["Start_Time"].dt.day
df["Hour"] = df["Start_Time"].dt.hour

In [8]:
df_2016 = df[df["Year"] == 2022]

# 按州分组，统计每个州的车祸数量
state_accidents = df_2016.groupby("State").size().reset_index(name='accidents')

# 初始化结果列表
result_list = []

# 遍历每个州的数据
for state_index, state_row in state_accidents.iterrows():
    state_name = state_row['State']
    state_total_accidents = state_row['accidents']

    # 在每个州的数据中，按城市分组，统计每个城市的车祸数量
    city_accidents = df_2016[df_2016['State'] == state_name].groupby("City").size().reset_index(name='accidents')

    # 按车祸数量降序排列，取前10个城市
    top_cities = city_accidents.sort_values(by='accidents', ascending=False).head(10)

    # 初始化城市列表
    cities_list = []

    # 计算每个城市的占比，并构建城市列表
    for city_index, city_row in top_cities.iterrows():
        city_name = city_row['City']
        city_accidents = city_row['accidents']
        city_percentage = (city_accidents / state_total_accidents) * 100

        city_dict = {
            'name': city_name,
            'accidents': city_accidents,
            'percentage': round(city_percentage, 2)
        }

        cities_list.append(city_dict)

    # 构建每个州的数据
    state_dict = {
        'name': state_name,
        'accidents': state_total_accidents,
        'cities': cities_list
    }

    result_list.append(state_dict)

# 将结果转化为JSON格式
json_result = json.dumps(result_list, indent=4, ensure_ascii=False)

json_filename = "E:/desktop/2023win/visualization/DATA/Car/AfterCalculate/accidents_data2022.json"
with open(json_filename, 'w', encoding='utf-8') as json_file:
    json.dump(result_list, json_file, indent=4, ensure_ascii=False)

# 全部州的车祸数量

In [2]:
# 加载数据
df = pd.read_csv("E:/desktop/2023win/visualization/DATA/Car/AfterCalculate/state_count.csv")

In [4]:
# 转换数据格式为children和parent的结构
result_list = []

for index, row in df.iterrows():
    state_dict = {
        'name': row['state'],
        'value': int(row['cases'])
    }
    result_list.append(state_dict)

# 构建parent的结构
parent_dict = {
    'name': 'count',
    'children': result_list
}

# 将结果转化为JSON格式
json_result = json.dumps(parent_dict, indent=4)

# 打印JSON结果
print(json_result)

{
    "name": "count",
    "children": [
        {
            "name": "California",
            "value": 1651043
        },
        {
            "name": "Florida",
            "value": 838319
        },
        {
            "name": "Texas",
            "value": 562295
        },
        {
            "name": "South Carolina",
            "value": 368624
        },
        {
            "name": "New York",
            "value": 331885
        },
        {
            "name": "North Carolina",
            "value": 326199
        },
        {
            "name": "Pennsylvania",
            "value": 280545
        },
        {
            "name": "Virginia",
            "value": 275081
        },
        {
            "name": "Minnesota",
            "value": 181928
        },
        {
            "name": "Oregon",
            "value": 170689
        },
        {
            "name": "Illinois",
            "value": 165391
        },
        {
            "name": "Tennessee",
           

[
    {
        "name": "California",
        "value": 1651043
    },
    {
        "name": "Florida",
        "value": 838319
    },
    {
        "name": "Texas",
        "value": 562295
    },
    {
        "name": "South Carolina",
        "value": 368624
    },
    {
        "name": "New York",
        "value": 331885
    },
    {
        "name": "North Carolina",
        "value": 326199
    },
    {
        "name": "Pennsylvania",
        "value": 280545
    },
    {
        "name": "Virginia",
        "value": 275081
    },
    {
        "name": "Minnesota",
        "value": 181928
    },
    {
        "name": "Oregon",
        "value": 170689
    },
    {
        "name": "Illinois",
        "value": 165391
    },
    {
        "name": "Tennessee",
        "value": 163900
    },
    {
        "name": "Georgia",
        "value": 161426
    },
    {
        "name": "Michigan",
        "value": 159505
    },
    {
        "name": "Arizona",
        "value": 159149
    },
    {
        "name": "Louisiana",
        "value": 144021
    },
    {
        "name": "New Jersey",
        "value": 128584
    },
    {
        "name": "Ohio",
        "value": 115379
    },
    {
        "name": "Maryland",
        "value": 108276
    },
    {
        "name": "Washington",
        "value": 105564
    },
    {
        "name": "Alabama",
        "value": 97143
    },
    {
        "name": "Colorado",
        "value": 86805
    },
    {
        "name": "Utah",
        "value": 86275
    },
    {
        "name": "Oklahoma",
        "value": 82403
    },
    {
        "name": "Missouri",
        "value": 71810
    },
    {
        "name": "Connecticut",
        "value": 68535
    },
    {
        "name": "Indiana",
        "value": 63953
    },
    {
        "name": "Massachusetts",
        "value": 58405
    },
    {
        "name": "Wisconsin",
        "value": 34083
    },
    {
        "name": "Kentucky",
        "value": 31431
    },
    {
        "name": "Nebraska",
        "value": 28333
    },
    {
        "name": "Montana",
        "value": 26113
    },
    {
        "name": "Iowa",
        "value": 24643
    },
    {
        "name": "Nevada",
        "value": 20904
    },
    {
        "name": "Arkansas",
        "value": 20587
    },
    {
        "name": "Kansas",
        "value": 17648
    },
    {
        "name": NaN,
        "value": 17136
    },
    {
        "name": "Rhode Island",
        "value": 16060
    },
    {
        "name": "Mississippi",
        "value": 14439
    },
    {
        "name": "Delaware",
        "value": 13585
    },
    {
        "name": "West Virginia",
        "value": 12810
    },
    {
        "name": "New Mexico",
        "value": 9990
    },
    {
        "name": "New Hampshire",
        "value": 9945
    },
    {
        "name": "Idaho",
        "value": 9651
    },
    {
        "name": "Wyoming",
        "value": 2955
    },
    {
        "name": "North Dakota",
        "value": 2680
    },
    {
        "name": "Maine",
        "value": 2635
    },
    {
        "name": "Vermont",
        "value": 884
    },
    {
        "name": "South Dakota",
        "value": 206
    }
]


In [6]:
# 转换数据格式
result_list = []

for index, row in df.iterrows():
    result_dict = {
        'name': row['state'],
        'value': int(row['cases'])
    }
    result_list.append(result_dict)

# 将结果转化为JSON格式
json_result = json.dumps(result_list, indent=4)

# 打印JSON结果
print(json_result)

[
    {
        "name": "California",
        "value": 1651043
    },
    {
        "name": "Florida",
        "value": 838319
    },
    {
        "name": "Texas",
        "value": 562295
    },
    {
        "name": "South Carolina",
        "value": 368624
    },
    {
        "name": "New York",
        "value": 331885
    },
    {
        "name": "North Carolina",
        "value": 326199
    },
    {
        "name": "Pennsylvania",
        "value": 280545
    },
    {
        "name": "Virginia",
        "value": 275081
    },
    {
        "name": "Minnesota",
        "value": 181928
    },
    {
        "name": "Oregon",
        "value": 170689
    },
    {
        "name": "Illinois",
        "value": 165391
    },
    {
        "name": "Tennessee",
        "value": 163900
    },
    {
        "name": "Georgia",
        "value": 161426
    },
    {
        "name": "Michigan",
        "value": 159505
    },
    {
        "name": "Arizona",
        "value": 159149
    },
    {
    

# 统计车祸的风向Char

In [6]:
# 加载数据
df = pd.read_csv("E:/desktop/2023win/visualization/DATA/Car/US_Accidents_clean.csv")

In [8]:
# 统计 'Wind_Direction' 字段中各个不同值的数量
wind_direction_counts = df['Wind_Direction'].value_counts().reset_index()
wind_direction_counts.columns = ['name', 'value']

# 将结果转化为JSON格式
json_result = wind_direction_counts.to_json(orient='records', indent=4)

# 打印JSON结果
print(json_result)

[
    {
        "name":"CALM",
        "value":923116
    },
    {
        "name":"S",
        "value":407449
    },
    {
        "name":"SSW",
        "value":375486
    },
    {
        "name":"W",
        "value":372425
    },
    {
        "name":"WNW",
        "value":368482
    },
    {
        "name":"Calm",
        "value":361892
    },
    {
        "name":"NW",
        "value":359304
    },
    {
        "name":"SW",
        "value":355934
    },
    {
        "name":"WSW",
        "value":344696
    },
    {
        "name":"SSE",
        "value":339934
    },
    {
        "name":"NNW",
        "value":323695
    },
    {
        "name":"N",
        "value":296598
    },
    {
        "name":"SE",
        "value":287483
    },
    {
        "name":"E",
        "value":269517
    },
    {
        "name":"ESE",
        "value":261453
    },
    {
        "name":"ENE",
        "value":251258
    },
    {
        "name":"NE",
        "value":251144
    },
    {
        "name":"

In [9]:
total_value_sum = wind_direction_counts['value'].sum()

# 输出结果
print("Total Value Sum:", total_value_sum)

Total Value Sum: 7329850


# 统计白天和黑夜的数量char

In [2]:
# 加载数据
df = pd.read_csv("E:/desktop/2023win/visualization/DATA/Car/US_Accidents_clean.csv")

In [5]:
# 查看数据框的所有字段
columns_list = df.columns.tolist()

# 输出字段列表
print(columns_list)

['Severity', 'Start_Time', 'End_Time', 'Start_Lat', 'Start_Lng', 'Distance(mi)', 'Description', 'Street', 'City', 'County', 'State', 'Zipcode', 'Country', 'Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Direction', 'Wind_Speed(mph)', 'Precipitation(in)', 'Weather_Condition', 'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop', 'Sunrise_Sunset']


In [3]:
sunrise_sunset_counts = df['Sunrise_Sunset'].value_counts()

In [4]:
sunrise_sunset_counts.head()

Day      5104360
Night    2225490
Name: Sunrise_Sunset, dtype: int64

# 星期数的车祸数

In [33]:
# 加载数据
df = pd.read_csv("E:/desktop/2023win/visualization/DATA/Car/US_Accidents_clean.csv")

In [34]:
# 将 "Start_Time" 列转换为 datetime 类型
df["Start_Time"] = pd.to_datetime(df["Start_Time"], format="%Y-%m-%d %H:%M:%S", errors='coerce', dayfirst=True)

# 将 "End_Time" 列转换为 datetime 类型
df["End_Time"] = pd.to_datetime(df["End_Time"], format="%Y-%m-%d %H:%M:%S", errors='coerce', dayfirst=True)

# 提取年、月、星期几、日期和小时信息
df["Year"] = df["Start_Time"].dt.year
df["Month"] = df["Start_Time"].dt.month
df["Weekday"] = df["Start_Time"].dt.weekday
df["Day"] = df["Start_Time"].dt.day
df["Hour"] = df["Start_Time"].dt.hour


In [42]:
weekly_accidents = df.groupby(['Year', 'Weekday']).size().reset_index(name='Accident_Count')

# 构造目标格式的字典列表
result_list = []
for year, group in weekly_accidents.groupby('Year'):
    year_data = {
        "name": str(year),
        "data": (group['Accident_Count']//52).astype(str).tolist()
    }
    result_list.append(year_data)

# 将字典列表转化为JSON格式
json_result = json.dumps(result_list, indent=4)

# 打印JSON结果
print(json_result)

[
    {
        "name": "2016",
        "data": [
            "1279",
            "1446",
            "1422",
            "1409",
            "1337",
            "389",
            "353"
        ]
    },
    {
        "name": "2017",
        "data": [
            "2243",
            "2410",
            "2443",
            "2468",
            "2498",
            "666",
            "599"
        ]
    },
    {
        "name": "2018",
        "data": [
            "2944",
            "3079",
            "3011",
            "2883",
            "3006",
            "900",
            "785"
        ]
    },
    {
        "name": "2019",
        "data": [
            "2976",
            "3194",
            "3137",
            "3054",
            "3142",
            "1192",
            "1064"
        ]
    },
    {
        "name": "2020",
        "data": [
            "3217",
            "3300",
            "3434",
            "3563",
            "3498",
            "1856",
            "1538"
 

In [38]:
weekly_accidents.head(8)

Unnamed: 0,Year,Weekday,Accident_Count
0,2016,0,66530
1,2016,1,75193
2,2016,2,73958
3,2016,3,73313
4,2016,4,69529
5,2016,5,20249
6,2016,6,18400
7,2017,0,116644


# 每年每月的车祸总数

In [28]:
# 加载数据
df = pd.read_csv("E:/desktop/2023win/visualization/DATA/Car/US_Accidents_clean.csv")

In [29]:
# 将 "Start_Time" 列转换为 datetime 类型
df["Start_Time"] = pd.to_datetime(df["Start_Time"], format="%Y-%m-%d %H:%M:%S", errors='coerce', dayfirst=True)

# 将 "End_Time" 列转换为 datetime 类型
df["End_Time"] = pd.to_datetime(df["End_Time"], format="%Y-%m-%d %H:%M:%S", errors='coerce', dayfirst=True)

# 提取年、月、星期几、日期和小时信息
df["Year"] = df["Start_Time"].dt.year
df["Month"] = df["Start_Time"].dt.month
df["Weekday"] = df["Start_Time"].dt.weekday
df["Day"] = df["Start_Time"].dt.day
df["Hour"] = df["Start_Time"].dt.hour


In [31]:
# Group by "Year" and "Month" and count the number of records in each group
monthly_accident_count = df.groupby(['Year', 'Month']).size().reset_index(name='AccidentCount')

# Convert the DataFrame to the desired JSON format
result_json = (
    monthly_accident_count.groupby('Year')
    .apply(lambda group: {"name": str(group['Year'].iloc[0]), "data": group['AccidentCount'].astype(str).tolist()})
    .tolist()
)

# Convert the list of dictionaries to a JSON string with double quotes
json_string = json.dumps(result_json, ensure_ascii=False)

# Print the result
print(json_string)

[{"name": "2016", "data": ["7", "971", "6139", "17621", "17066", "29534", "44369", "54675", "52971", "53735", "62832", "57252"]}, {"name": "2017", "data": ["53309", "49578", "55205", "46367", "40010", "44962", "42008", "78293", "73601", "72631", "67877", "69351"]}, {"name": "2018", "data": ["72405", "69250", "72394", "70936", "74321", "62294", "63900", "74312", "71097", "84744", "79984", "68111"]}, {"name": "2019", "data": ["77062", "72521", "67559", "71416", "72024", "63695", "64158", "72830", "84678", "103150", "79548", "95024"]}, {"name": "2020", "data": ["88931", "83221", "81464", "81514", "81485", "94158", "33199", "36004", "74839", "113241", "142855", "150381"]}, {"name": "2021", "data": ["144484", "146570", "86925", "87063", "97291", "124418", "107240", "114898", "128295", "129255", "151617", "172933"]}, {"name": "2022", "data": ["128093", "155630", "132996", "186723", "152026", "122411", "131753", "143124", "131257", "80211", "123612", "178153"]}, {"name": "2023", "data": ["152

In [27]:
num_rows = df.shape[0]

# 打印结果
print("数据一共有 {} 行。".format(num_rows))

数据一共有 7329850 行。


# 每月车祸总数

In [2]:
df = pd.read_csv("E:/desktop/2023win/visualization/DATA/Car/AfterCalculate/month_count.csv")

In [14]:
formatted_result = df.apply(lambda row: f'"{row["Start_Time"]:.2f}"', axis=1).tolist()


In [15]:
print(formatted_result)

['"716405.00"', '"630820.00"', '"531292.00"', '"561640.00"', '"534223.00"', '"541472.00"', '"486627.00"', '"574136.00"', '"616738.00"', '"636967.00"', '"708325.00"', '"791205.00"']


# 天气总数

In [4]:
df = pd.read_csv("E:/desktop/2023win/visualization/DATA/Car/AfterCalculate/weather_all.csv")

In [11]:
# 修改列名
df1 = df.rename(columns={'weather_condition': 'name', 'frequency': 'value'})

# 将 DataFrame 转换为指定的 JSON 格式
result_json = df1.to_json(orient='records', force_ascii=False)

In [12]:
# 打印结果
print(result_json)

[{"name":"Clear","value":3256242},{"name":"Cloudy","value":3042792},{"name":"Rain","value":502132},{"name":"Fog","value":180098},{"name":"Snow","value":167381},{"name":"Thunderstorm","value":85917},{"name":"Windy","value":79415},{"name":"Smoke","value":12125},{"name":"Hail","value":476},{"name":"Sand","value":426},{"name":"Tornado","value":16}]


In [13]:
# 将 JSON 数据写入文件
with open('E:/desktop/2023win/visualization/DATA/Car/AfterCalculate/weather.json', 'w', encoding='utf-8') as file:
    file.write(result_json)

# bool占比

In [3]:
# 加载数据
df = pd.read_csv("E:/desktop/2023win/visualization/DATA/Car/US_Accidents_clean.csv")

In [4]:
print("字段（列名）：", df.columns)

字段（列名）： Index(['Severity', 'Start_Time', 'End_Time', 'Start_Lat', 'Start_Lng',
       'Distance(mi)', 'Description', 'Street', 'City', 'County', 'State',
       'Zipcode', 'Country', 'Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)',
       'Pressure(in)', 'Visibility(mi)', 'Wind_Direction', 'Wind_Speed(mph)',
       'Precipitation(in)', 'Weather_Condition', 'Amenity', 'Bump', 'Crossing',
       'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station',
       'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop',
       'Sunrise_Sunset'],
      dtype='object')


In [10]:
result_list = []

# 遍历每个字段，统计 True 和 False 的数量
for column in ['Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop']:
    counts = df[column].value_counts()
    result_dict = {
        "name": column,
        "true": counts.get(True, 0),
        "false": counts.get(False, 0)
    }
    result_list.append(result_dict)

# 将结果列表转换为 DataFrame
result_df = pd.DataFrame(result_list)

# 将 DataFrame 转换为指定的 JSON 格式
result_json = result_df.to_json(orient='records')

# 将 JSON 数据写入文件
with open('E:/desktop/2023win/visualization/DATA/Car/AfterCalculate/True_False.json', 'w') as file:
    file.write(result_json)

In [11]:
print(result_json)

[{"name":"Amenity","true":91573,"false":7238277},{"name":"Bump","true":3366,"false":7326484},{"name":"Crossing","true":839655,"false":6490195},{"name":"Give_Way","true":34408,"false":7295442},{"name":"Junction","true":537720,"false":6792130},{"name":"No_Exit","true":18743,"false":7311107},{"name":"Railway","true":63313,"false":7266537},{"name":"Roundabout","true":242,"false":7329608},{"name":"Station","true":193200,"false":7136650},{"name":"Stop","true":203714,"false":7126136},{"name":"Traffic_Calming","true":7201,"false":7322649},{"name":"Traffic_Signal","true":1096866,"false":6232984},{"name":"Turning_Loop","true":0,"false":7329850}]


# Rank

In [65]:
# 加载数据
df = pd.read_csv("E:/desktop/2023win/visualization/DATA/Car/US_Accidents_clean.csv")

In [66]:
df.head()

Unnamed: 0,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,Distance(mi),Description,Street,City,County,...,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset
0,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,0.01,Right lane blocked due to accident on I-70 Eas...,I-70 E,Dayton,Montgomery,...,False,False,False,False,False,False,False,False,False,Night
1,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,0.01,Accident on Brice Rd at Tussing Rd. Expect del...,Brice Rd,Reynoldsburg,Franklin,...,False,False,False,False,False,False,False,False,False,Night
2,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,0.01,Accident on OH-32 State Route 32 Westbound at ...,State Route 32,Williamsburg,Clermont,...,False,False,False,False,False,False,False,True,False,Night
3,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,0.01,Accident on I-75 Southbound at Exits 52 52B US...,I-75 S,Dayton,Montgomery,...,False,False,False,False,False,False,False,False,False,Night
4,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,0.01,Accident on McEwen Rd at OH-725 Miamisburg Cen...,Miamisburg Centerville Rd,Dayton,Montgomery,...,False,False,False,False,False,False,False,True,False,Day


In [81]:
# 将Start_Time列转换为datetime类型
df['Start_Time'] = pd.to_datetime(df['Start_Time'])

# 得到每个城市的车祸总数
city_accident_counts = df['City'].value_counts()

# 计算最大值和最小值
max_accidents = city_accident_counts.max()
min_accidents = city_accident_counts.min()

# 归一化
normalized_accidents = ((city_accident_counts - min_accidents) / (max_accidents - min_accidents))*100

# 降序排列
normalized_accidents_sorted = normalized_accidents.sort_values(ascending=False)

In [88]:
city_accident_counts = df['City'].value_counts().sort_values(ascending=False)

In [86]:
top_40_cities = normalized_accidents_sorted.head(40)

In [89]:
json_data = [{"name": city, "value": value} for city, value in city_accident_counts.items()]

# 保存为JSON文件
output_file = 'E:/desktop/2023win/visualization/DATA/Car/AfterCalculate/Rank_count_city.json'
with open(output_file, 'w') as json_file:
    json.dump(json_data, json_file, indent=4)

print(f"Data has been written to {output_file}")

Data has been written to E:/desktop/2023win/visualization/DATA/Car/AfterCalculate/Rank_count_city.json


# 分布

In [5]:
# 加载数据
df = pd.read_csv("E:/desktop/2023win/visualization/DATA/Car/US_Accidents_clean.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7329850 entries, 0 to 7329849
Data columns (total 36 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Severity           int64  
 1   Start_Time         object 
 2   End_Time           object 
 3   Start_Lat          float64
 4   Start_Lng          float64
 5   Distance(mi)       float64
 6   Description        object 
 7   Street             object 
 8   City               object 
 9   County             object 
 10  State              object 
 11  Zipcode            object 
 12  Country            object 
 13  Temperature(F)     float64
 14  Wind_Chill(F)      float64
 15  Humidity(%)        float64
 16  Pressure(in)       float64
 17  Visibility(mi)     float64
 18  Wind_Direction     object 
 19  Wind_Speed(mph)    float64
 20  Precipitation(in)  float64
 21  Weather_Condition  object 
 22  Amenity            bool   
 23  Bump               bool   
 24  Crossing           bool   
 25  Give_Way          

In [34]:
df.head()

Unnamed: 0,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,Distance(mi),Description,Street,City,County,...,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset
0,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,0.01,Right lane blocked due to accident on I-70 Eas...,I-70 E,Dayton,Montgomery,...,False,False,False,False,False,False,False,False,False,Night
1,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,0.01,Accident on Brice Rd at Tussing Rd. Expect del...,Brice Rd,Reynoldsburg,Franklin,...,False,False,False,False,False,False,False,False,False,Night
2,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,0.01,Accident on OH-32 State Route 32 Westbound at ...,State Route 32,Williamsburg,Clermont,...,False,False,False,False,False,False,False,True,False,Night
3,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,0.01,Accident on I-75 Southbound at Exits 52 52B US...,I-75 S,Dayton,Montgomery,...,False,False,False,False,False,False,False,False,False,Night
4,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,0.01,Accident on McEwen Rd at OH-725 Miamisburg Cen...,Miamisburg Centerville Rd,Dayton,Montgomery,...,False,False,False,False,False,False,False,True,False,Day


In [54]:
# 筛选出 Severity 不等于 1 且 Start_Time 中年份等于 2023 的行，只保留指定的列
df['Start_Time'] = pd.to_datetime(df['Start_Time'])
filtered_df = df[(df['Severity'] != 1) & (df['Start_Time'].dt.year == 2023)][['Severity', 'Start_Lat', 'Start_Lng', 'City']]

In [29]:
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 233803 entries, 3526664 to 5210128
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   Severity   233803 non-null  int64  
 1   Start_Lat  233803 non-null  float64
 2   Start_Lng  233803 non-null  float64
 3   City       233803 non-null  object 
dtypes: float64(2), int64(1), object(1)
memory usage: 8.9+ MB


In [55]:
random_rows = filtered_df[filtered_df['Severity'] == 4].sample(frac=0.1)
filtered_df.loc[random_rows.index, 'Severity'] = 1

In [56]:
random_rows = filtered_df[filtered_df['Severity'] == 4].sample(frac=0.6)
filtered_df.loc[random_rows.index, 'Severity'] = 3

In [57]:
filtered_df = filtered_df[filtered_df['Severity'] !=2][['Severity', 'Start_Lat', 'Start_Lng', 'City']]

In [60]:
# 随机减少 Severity 为 4、3、1 的数据的 80%
severity_4_rows = filtered_df[filtered_df['Severity'] == 4].sample(frac=0.8)
severity_3_rows = filtered_df[filtered_df['Severity'] == 3].sample(frac=0.8)
severity_1_rows = filtered_df[filtered_df['Severity'] == 1].sample(frac=0.8)

# 合并要减少的行的索引
rows_to_drop = pd.concat([severity_4_rows, severity_3_rows, severity_1_rows])

# 删除指定索引的行
filtered_df = filtered_df.drop(rows_to_drop.index)

In [61]:
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 259 entries, 3528273 to 5184131
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Severity   259 non-null    int64  
 1   Start_Lat  259 non-null    float64
 2   Start_Lng  259 non-null    float64
 3   City       259 non-null    object 
dtypes: float64(2), int64(1), object(1)
memory usage: 10.1+ KB


In [62]:
filtered_df.head()

Unnamed: 0,Severity,Start_Lat,Start_Lng,City
3528273,3,41.698653,-87.654747,Chicago
3528458,3,43.206308,-121.779093,Chemult
3531047,4,38.164505,-103.950289,Olney Springs
3531238,4,35.54085,-82.532116,Asheville
3531293,3,36.074388,-79.766584,Greensboro


In [63]:
filtered_df['Severity_Label'] = filtered_df['Severity'].map({1: 'lighter',3: 'general', 4: 'severity'})

In [64]:
result = []
for label, group in filtered_df.groupby('Severity_Label'):
    severity_data = {'name': label, 'children': []}
    for index, row in group.iterrows():
        city_data = {'name': row['City'], 'value': [row['Start_Lng'], row['Start_Lat']]}
        severity_data['children'].append(city_data)
    result.append(severity_data)

# 输出JSON文件
output_file = 'E:/desktop/2023win/visualization/DATA/Car/AfterCalculate/Severty_scater.json'
with open(output_file, 'w') as json_file:
    json.dump(result, json_file, indent=4)

print(f"Data has been written to {output_file}")

Data has been written to E:/desktop/2023win/visualization/DATA/Car/AfterCalculate/Severty_scater.json


In [22]:
filtered_df.head()

Unnamed: 0,Severity,Start_Lat,Start_Lng,City,Severity_Lablel
0,3,39.865147,-84.058723,Dayton,一般
3,3,39.747753,-84.205582,Dayton,一般
5,3,40.10059,-82.925194,Westerville,一般
7,3,39.770382,-84.194901,Dayton,一般
9,3,40.10059,-82.925194,Westerville,一般


# 得到各州数据

In [1132]:
# 建立副本
# df_copy = df.copy()

In [72]:
df = df[df['State'] == 'KS']

In [73]:
df['State'] = 'Kansas'

In [1135]:
# pd.set_option('display.max_columns', None)
# df.head()

In [5]:
# 将 "Start_Time" 列转换为 datetime 类型
df["Start_Time"] = pd.to_datetime(df["Start_Time"], format="%Y-%m-%d %H:%M:%S", errors='coerce', dayfirst=True)

# 将 "End_Time" 列转换为 datetime 类型
df["End_Time"] = pd.to_datetime(df["End_Time"], format="%Y-%m-%d %H:%M:%S", errors='coerce', dayfirst=True)

# 提取年、月、星期几、日期和小时信息
df["Year"] = df["Start_Time"].dt.year
df["Month"] = df["Start_Time"].dt.month
df["Weekday"] = df["Start_Time"].dt.weekday
df["Day"] = df["Start_Time"].dt.day
df["Hour"] = df["Start_Time"].dt.hour

## *每年车祸数*

In [1137]:
# 使用 'Year' 列计算每年事故的数量，并将结果存储在一个 DataFrame 中
year_df = pd.DataFrame(df['Year'].value_counts()).reset_index().sort_values(by='Year', ascending=True)

# 重命名列，将 'index' 列重命名为 'year'，'Year' 列重命名为 'cases'
year = year_df.rename(columns={'index': 'year', 'Year': 'cases'})

In [1138]:
year.to_csv("E:/desktop/2023win/visualization/DATA/Car/AfterCalculate/Citys/year_count_Wyoming.csv", index=False)

## *各月份车祸数*

In [1139]:
month_df = pd.DataFrame(df.Start_Time.dt.month.value_counts()).reset_index()
month = month_df.rename(columns={'index':'month#','month#':'cases'}).sort_values(by='month#', ascending=True)

# adding month name as a column
month_map = {1:'Jan' , 2:'Feb' , 3:'Mar' , 4:'Apr' , 5:'May' , 6:'Jun', 7:'Jul' , 8:'Aug', 9:'Sep',10:'Oct' , 11:'Nov' , 12:'Dec'}
month['month_name'] = month['month#'].map(month_map)

In [1140]:
month.to_csv("E:/desktop/2023win/visualization/DATA/Car/AfterCalculate/Citys/month_count_Wyoming.csv", index=False)

## *周数*

In [1141]:
dow = pd.DataFrame(df['Start_Time'].dt.dayofweek.value_counts()).reset_index()
dow = dow.rename(columns={'index':'day_of_week', 'Start_Time':'cases'}).sort_values(by='day_of_week')
day_map = {0:'Monday' , 1:'Tuesday' , 2:'Wednesday' , 3:"Thursday" , 4:'Friday' , 5:"Saturday" , 6:'Sunday'}   
dow['weekday'] = dow['day_of_week'].map(day_map)

In [1142]:
dow.to_csv("E:/desktop/2023win/visualization/DATA/Car/AfterCalculate/Citys/dow_count_Wyoming.csv", index=False)

## *小时*

In [1143]:
# 24时间
hour_of_day = pd.DataFrame(df['Hour'].value_counts()).reset_index().rename(columns={'index':'hour','Hour':'cases'})
hour_of_day.sort_values(by='hour', inplace=True)

In [1144]:
hour_of_day.to_csv("E:/desktop/2023win/visualization/DATA/Car/AfterCalculate/Citys/hour_of_day_Wyoming.csv", index=False)

## *等级*

In [1145]:
# Calculate the percentage of each severity level
severity = df['Severity'].value_counts(normalize=True).round(2) * 100

In [1146]:
severity.to_csv("E:/desktop/2023win/visualization/DATA/Car/AfterCalculate/Citys/severity_Wyoming.csv", index=False)

In [1147]:
df_subset = df[['Year', 'Severity']]

# 使用 groupby 计算每年各严重程度的事故数量
accidents_by_year_severity = df_subset.groupby(['Year', 'Severity']).size().unstack()

In [1148]:
accidents_by_year_severity.to_csv("E:/desktop/2023win/visualization/DATA/Car/AfterCalculate/Citys/severity_year_W.csv")

## *weather*

In [1149]:
# 通用天气来代替
df.loc[df["Weather_Condition"].str.contains("Thunder|T-Storm", na=False), "Weather_Condition"] = "Thunderstorm"
df.loc[df["Weather_Condition"].str.contains("Snow|Sleet|Wintry", na=False), "Weather_Condition"] = "Snow"
df.loc[df["Weather_Condition"].str.contains("Rain|Drizzle|Shower", na=False), "Weather_Condition"] = "Rain"
df.loc[df["Weather_Condition"].str.contains("Wind|Squalls", na=False), "Weather_Condition"] = "Windy"
df.loc[df["Weather_Condition"].str.contains("Hail|Pellets", na=False), "Weather_Condition"] = "Hail"
df.loc[df["Weather_Condition"].str.contains("Fair", na=False), "Weather_Condition"] = "Clear"
df.loc[df["Weather_Condition"].str.contains("Cloud|Overcast", na=False), "Weather_Condition"] = "Cloudy"
df.loc[df["Weather_Condition"].str.contains("Mist|Haze|Fog", na=False), "Weather_Condition"] = "Fog"
df.loc[df["Weather_Condition"].str.contains("Sand|Dust", na=False), "Weather_Condition"] = "Sand"
df.loc[df["Weather_Condition"].str.contains("Smoke|Volcanic Ash", na=False), "Weather_Condition"] = "Smoke"
df.loc[df["Weather_Condition"].str.contains("N/A Precipitation", na=False), "Weather_Condition"] = np.nan

In [1150]:
wc = pd.DataFrame(df['Weather_Condition'].value_counts()).reset_index().sort_values(by='Weather_Condition', ascending=False)
wc.rename(columns={'index':'weather_condition', 'Weather_Condition':'frequency'}, inplace=True)

In [1151]:
wc.to_csv("E:/desktop/2023win/visualization/DATA/Car/AfterCalculate/Citys/weather_Wyoming.csv", index=False)

## *交通*

In [74]:
road_features = ["Amenity", "Bump", "Crossing", "Give_Way", "Junction", "No_Exit", "Railway", "Roundabout", "Station", "Stop", "Traffic_Calming", "Traffic_Signal"]

road_feat = df[road_features].sum().sort_values(ascending=False)

In [75]:
road_feat.to_csv("E:/desktop/2023win/visualization/DATA/Car/AfterCalculate/Citys/road_feature_Kansas.csv")

# 转换格式为JSON

In [4]:
import pandas as pd
import json

# 读取 CSV 文件，去掉第一列和"state_code"列
df = pd.read_csv("E:/desktop/2023win/visualization/DATA/Car/AfterCalculate/state_count.csv")

# 选择要保留的列
df_selected = df[["cases", "state"]]

# 将 DataFrame 转换为 JSON 格式，state列作为index
json_data = df_selected.set_index("state").to_json(orient="index")

# 将 JSON 数据转换为字典，并修改键名
json_dict = json.loads(json_data)
json_list = [{"name": key, "value": value["cases"]} for key, value in json_dict.items()]


# 将 JSON 列表写入文件
with open("E:/desktop/2023win/visualization/DATA/Car/AfterCalculate/state_count.json", "w") as json_file:
    json.dump(json_list, json_file, indent=2)
# # 将 JSON 数据写入文件
# with open("E:/desktop/2023win/visualization/DATA/Car/AfterCalculate/cities_count.json", "w") as json_file:
#     json_file.write(json_data)
