In [1]:
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier  # for classification
from sklearn.linear_model import LinearRegression  # for regression
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv("USA_Full.csv")

In [3]:
df.head()

Unnamed: 0,EVENT_ID_CNTY,EVENT_DATE,YEAR,TIME_PRECISION,DISORDER_TYPE,EVENT_TYPE,SUB_EVENT_TYPE,ACTOR1,ASSOC_ACTOR_1,INTER1,...,LOCATION,LATITUDE,LONGITUDE,GEO_PRECISION,SOURCE,SOURCE_SCALE,NOTES,FATALITIES,TAGS,TIMESTAMP
0,USA23310,01-January-2020,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),Health Workers (United States),6,...,Cleveland,41.482,-81.67,1,WKYC Studios; Crowd Counting Consortium,Other-Subnational,"On 1 January 2020, an unknown number of people...",0,crowd size=no report,1612546518
1,USA23416,01-January-2020,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),,6,...,Rochester,43.155,-77.616,1,13WHAM ABC; Democrat and Chronicle; 10NBC,Subnational,"On 1 January 2020, people held a peace march o...",0,crowd size=no report,1612546518
2,USA23636,01-January-2020,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),Government of the United States (2017-2021),6,...,Phoenix,33.449,-112.074,1,Count Love; Channel 12 (Mesa),Other-Subnational,"On 1 January 2020, a group of people gathered ...",0,crowd size=no report,1612546518
3,USA23414,01-January-2020,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),,6,...,New York - Manhattan,40.783,-73.966,1,CBS2 (New York),Subnational,"On 1 January 2020, protesters gathered at NYPD...",0,crowd size=no report,1612546519
4,USA23635,01-January-2020,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),BLM: Black Lives Matter,6,...,Oakland,37.804,-122.271,1,KTVU Fox2,Subnational,"On 1 January 2020, a group of people gathered ...",0,crowd size=no report,1612546519


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56008 entries, 0 to 56007
Data columns (total 31 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   EVENT_ID_CNTY       56008 non-null  object 
 1   EVENT_DATE          56008 non-null  object 
 2   YEAR                56008 non-null  int64  
 3   TIME_PRECISION      56008 non-null  int64  
 4   DISORDER_TYPE       56008 non-null  object 
 5   EVENT_TYPE          56008 non-null  object 
 6   SUB_EVENT_TYPE      56008 non-null  object 
 7   ACTOR1              56008 non-null  object 
 8   ASSOC_ACTOR_1       41667 non-null  object 
 9   INTER1              56008 non-null  int64  
 10  ACTOR2              5506 non-null   object 
 11  ASSOC_ACTOR_2       2227 non-null   object 
 12  INTER2              56008 non-null  int64  
 13  INTERACTION         56008 non-null  int64  
 14  CIVILIAN_TARGETING  614 non-null    object 
 15  ISO                 56008 non-null  int64  
 16  REGI

In [5]:
df.describe()

Unnamed: 0,YEAR,TIME_PRECISION,INTER1,INTER2,INTERACTION,ISO,ADMIN3,LATITUDE,LONGITUDE,GEO_PRECISION,FATALITIES,TIMESTAMP
count,56008.0,56008.0,56008.0,56008.0,56008.0,56008.0,0.0,56008.0,56008.0,56008.0,56008.0,56008.0
mean,2021.018301,1.015212,5.816865,0.41203,57.015391,840.0,,38.361133,-92.193876,1.019354,0.006642,1647879000.0
std,1.022662,0.126557,0.751482,1.482548,10.76887,0.0,,5.105354,18.229624,0.162834,0.167936,28686730.0
min,2020.0,1.0,1.0,0.0,10.0,840.0,,19.443,-166.534,1.0,0.0,1612546000.0
25%,2020.0,1.0,6.0,0.0,60.0,840.0,,34.77,-106.651,1.0,0.0,1616538000.0
50%,2021.0,1.0,6.0,0.0,60.0,840.0,,39.196,-85.977,1.0,0.0,1645566000.0
75%,2022.0,1.0,6.0,0.0,60.0,840.0,,41.821,-77.045,1.0,0.0,1680634000.0
max,2023.0,3.0,8.0,8.0,80.0,840.0,,71.291,-67.461,3.0,22.0,1686677000.0


In [9]:
pd.options.display.max_colwidth = 3000

# Step One: Converting Crowd Size and Removing NaNs
## Calling the resulting dataframe `df1`

In [10]:
def extract_crowd_size(tags):
    """Pull the ``crowd size=...`` entry out of a semicolon-separated TAGS value.

    Returns the first tag (whitespace-stripped, original casing kept) whose
    lowercase form starts with "crowd size". Non-string input (e.g. NaN) and
    strings without such a tag yield 'crowd size=no report'.
    """
    # Missing TAGS values arrive as non-strings (NaN/None): nothing to search
    if not isinstance(tags, str):
        return 'crowd size=no report'

    cleaned_tags = (piece.strip() for piece in tags.split(';'))
    return next(
        (piece for piece in cleaned_tags if piece.lower().startswith("crowd size")),
        'crowd size=no report',
    )

In [11]:
df['crowd_size'] = df['TAGS'].apply(extract_crowd_size)

In [12]:
df['crowd_size'].isna().sum()

0

In [13]:


# Hand-assigned numeric estimates for the most frequently occurring crowd-size tags.
# 'no report' and 'a group' carry no usable size information, so they map to NaN.
# Keys are lowercase; map_crowd_size() normalises its input before the lookup.
custom_mapping = {
    'crowd size=no report': np.nan,
    'crowd size=hundreds': 200,
    'crowd size=dozens': 24,
    'crowd size=more than 100': 110,
    'crowd size=about 100': 100,
    'crowd size=about 50': 50,
    'crowd size=about 200': 200,
    'crowd size=about 30': 30,
    'crowd size=about 20': 20,
    'crowd size=thousands': 2000,
    'crowd size=about a dozen': 12,
    'crowd size=about 40': 40,
    'crowd size=over 100': 115,
    'crowd size=more than 50': 55,
    'crowd size=several hundred': 300,
    'crowd size=small': 10,
    'crowd size=about 150': 150,
    'crowd size=more than 200': 210,
    'crowd size=several dozen': 48,
    'crowd size=about two dozen': 24,
    'crowd size=about 60': 60,
    'crowd size=about 300': 300,
    'crowd size=a small group': 10,
    'crowd size=around 100': 100,
    'crowd size=more than a dozen': 15,
    'crowd size=several': 5,
    'crowd size=nearly 100': 95,
    'crowd size=a few dozen': 36,
    'crowd size=about 75': 75,
    'crowd size=about 25': 25,
    'crowd size=around 50': 50,
    'crowd size=more than 300': 305,
    'crowd size=about 70': 70,
    'crowd size=about 15': 15,
    'crowd size=around 200': 200,
    'crowd size=a few hundred': 300,
    'crowd size=a group': np.nan,
    'crowd size=more than 30': 35,
    'crowd size=about 500': 500,
    'crowd size=more than 1000': 1020,
    'crowd size=more than 150': 155,
    'crowd size=over 200': 205,
    'crowd size=nearly 200': 195,
    'crowd size=at least 100': 105,
    'crowd size=a handful': 5,
    'crowd size=over 50': 55,
    'crowd size=around 30': 30,
    'crowd size=about 80': 80,
    'crowd size=about 1000': 1000,
    'crowd size=about 250': 250,
    'crowd size=about 400': 400,
    'crowd size=over 1000': 1010,
}


def map_crowd_size(crowd_size_string):
    """Translate a 'crowd size=...' tag into its numeric estimate.

    Parameters
    ----------
    crowd_size_string : str or any
        Tag as produced by extract_crowd_size(). Casing and surrounding
        whitespace are ignored.

    Returns
    -------
    int or float
        The value from ``custom_mapping``; ``np.nan`` when the tag is not in
        the mapping or the input is not a string.
    """
    if not isinstance(crowd_size_string, str):
        return np.nan
    # extract_crowd_size() matches tags case-insensitively but returns them with
    # their original casing, so normalise here or such tags would silently be NaN.
    normalized = crowd_size_string.strip().lower()
    return custom_mapping.get(normalized, np.nan)

# Translate every extracted crowd-size tag into its numeric estimate
df['crowd_size_numerical'] = df['crowd_size'].map(map_crowd_size)

# Show each tag next to the number it was mapped to
size_columns = ['crowd_size', 'crowd_size_numerical']
print(df[size_columns])


                  crowd_size  crowd_size_numerical
0       crowd size=no report                   NaN
1       crowd size=no report                   NaN
2       crowd size=no report                   NaN
3       crowd size=no report                   NaN
4       crowd size=no report                   NaN
...                      ...                   ...
56003      crowd size=dozens                  24.0
56004   crowd size=no report                   NaN
56005   crowd size=no report                   NaN
56006   crowd size=no report                   NaN
56007  crowd size=roughly 30                   NaN

[56008 rows x 2 columns]


In [16]:
# crowd_size_F = list(df['crowd_size'])
crowd_size_frequencies = df['crowd_size_numerical'].value_counts()

In [42]:
df1 = df[df['crowd_size_numerical'].notna()].copy(deep = True)

In [43]:
df1.shape

(18476, 34)

# Alternative approach: keep every row and impute the missing crowd sizes in proportion to the observed distribution, instead of dropping the NaNs

In [93]:
# Replace NaNs with the sentinel string 'unknown'.
# NOTE: after this line the column mixes floats and strings; the imputation cell
# below treats 'unknown' as the missing-value marker.
df.crowd_size_numerical = df.crowd_size_numerical.fillna('unknown')

In [94]:
# Impute missing crowd sizes by sampling from the empirical distribution of the
# known sizes, so the imputed column keeps the same proportions.

# Coerce to numeric: the 'unknown' sentinel (or any other non-number) becomes NaN,
# which also gives the column a proper float dtype instead of mixed objects.
crowd_sizes = pd.to_numeric(df['crowd_size_numerical'], errors='coerce')
is_unknown = crowd_sizes.isna()
unknown_indices = crowd_sizes[is_unknown].index

# Rows whose crowd size was actually reported
known_values_df = df[~is_unknown]

# Relative frequency of each known value
value_distribution = crowd_sizes[~is_unknown].value_counts(normalize=True)

# Seeded generator so Restart & Run All reproduces the same imputed values
rng = np.random.default_rng(42)
random_values = rng.choice(value_distribution.index.to_numpy(),
                           size=int(is_unknown.sum()),
                           p=value_distribution.to_numpy())

# Write the sampled values into the missing slots and store the numeric column.
# Re-running the cell is harmless: with nothing missing, zero values are drawn.
crowd_sizes.loc[is_unknown] = random_values
df['crowd_size_numerical'] = crowd_sizes

In [95]:
df['crowd_size_numerical'].value_counts()

200.0     12240
24.0      10707
100.0      3542
110.0      3166
50.0       2893
300.0      1912
30.0       1838
10.0       1505
20.0       1254
2000.0     1240
55.0       1233
12.0       1168
40.0       1099
115.0      1090
15.0        977
5.0         863
210.0       809
150.0       790
48.0        760
60.0        703
95.0        571
36.0        547
75.0        501
25.0        443
305.0       409
70.0        390
205.0       337
1020.0      336
500.0       329
35.0        329
155.0       321
195.0       289
105.0       286
80.0        276
1000.0      270
250.0       239
400.0       211
1010.0      135
Name: crowd_size_numerical, dtype: int64

# What counts as demonstration violence

In [21]:
df1['EVENT_TYPE'].value_counts()

Protests                  18071
Riots                       403
Strategic developments        2
Name: EVENT_TYPE, dtype: int64

In [27]:
df1.loc[df1['EVENT_TYPE']== 'Strategic developments']["NOTES"].head(30)

10705                                                                                                                                                         Non-violent activity: On 19 June 2020, about 100 people, many of whom were armed, gathered in Nancy (Kentucky) to protect monuments from anticipated demonstrators. No demonstration actually occurred.
12265    Non-violent activity: On 4 July 2020, about 200 people including members of the Pennsylvania Volunteer Militia, Boogaloo Boys, and Proud Boys gathered in Gettysburg (Pennsylvania) 'to protect the Civil War monuments and the nation's flag' after rumors of an 'antifa flag burning' spread on conservative information channels. No flag burners showed.
Name: NOTES, dtype: object

In [22]:
df1['DISORDER_TYPE'].value_counts()

Demonstrations                        18416
Political violence; Demonstrations       51
Political violence                        7
Strategic developments                    2
Name: DISORDER_TYPE, dtype: int64

In [28]:
df1.loc[df1['DISORDER_TYPE']== 'Political violence']["NOTES"].head(30)

2172                                                                                                                                                                                                                                                                                                                                                                                                                                                                          On 20 March 2020, about 300 inmates at the New Mexico Correctional Center in Grants (New Mexico) rioted after they were placed in lockdown in connection to the reported suicide of an inmate at the facility. According to reports, some of the inmates threw rocks at prison guards, causing damage and injuring one person, who was sent to the hospital. The prison guards used less-lethal bean bag rounds and paintball guns containing irritants to regain control.
2372                                                                           

In [23]:
df1['SUB_EVENT_TYPE'].value_counts()

Peaceful protest                      17600
Protest with intervention               420
Violent demonstration                   396
Excessive force against protesters       51
Mob violence                              7
Other                                     2
Name: SUB_EVENT_TYPE, dtype: int64

In [35]:
df1.loc[df1['SUB_EVENT_TYPE']== 'Other']["NOTES"].head(30)

10705                                                                                                                                                         Non-violent activity: On 19 June 2020, about 100 people, many of whom were armed, gathered in Nancy (Kentucky) to protect monuments from anticipated demonstrators. No demonstration actually occurred.
12265    Non-violent activity: On 4 July 2020, about 200 people including members of the Pennsylvania Volunteer Militia, Boogaloo Boys, and Proud Boys gathered in Gettysburg (Pennsylvania) 'to protect the Civil War monuments and the nation's flag' after rumors of an 'antifa flag burning' spread on conservative information channels. No flag burners showed.
Name: NOTES, dtype: object

In [44]:
# Creating my binary target
def map_target(category):
    """Encode a SUB_EVENT_TYPE label as the binary target.

    'Peaceful protest' and 'Other' map to 0; every other value maps to 1.
    """
    peaceful_labels = ('Peaceful protest', 'Other')
    return 0 if category in peaceful_labels else 1

# Derive the 'is_violent' target from each row's 'SUB_EVENT_TYPE' label
df1['is_violent'] = df1['SUB_EVENT_TYPE'].map(map_target)

# Show each label beside the target value it produced
target_preview_cols = ['SUB_EVENT_TYPE', 'is_violent']
print(df1[target_preview_cols])

         SUB_EVENT_TYPE  is_violent
8      Peaceful protest           0
9      Peaceful protest           0
10     Peaceful protest           0
11     Peaceful protest           0
17     Peaceful protest           0
...                 ...         ...
55970  Peaceful protest           0
55979  Peaceful protest           0
55985  Peaceful protest           0
55987  Peaceful protest           0
56003  Peaceful protest           0

[18476 rows x 2 columns]


In [45]:
df1['is_violent'].value_counts()

0    17602
1      874
Name: is_violent, dtype: int64

# Convert EVENT_DATE to datetime and keep only 2022 and 2023

In [None]:
# Parse the EVENT_DATE strings (e.g. "01-January-2020") into real timestamps
event_dates = pd.to_datetime(df1['EVENT_DATE'], format='%d-%B-%Y')

# Slice out our timeframe (2022-2023) into a NEW frame, ordered by date with a
# fresh index. df1 itself is left untouched so the cells below still see every year.
in_timeframe = event_dates.dt.year.isin([2022, 2023])
df1_recent = (
    df1.assign(EVENT_DATE=event_dates)
       .loc[in_timeframe]
       .sort_values('EVENT_DATE')
       .reset_index(drop=True)
)

# Convert the affiliated groups to lists

In [46]:
# Each actor column holds "; "-separated names; keep the pieces as a list
# in a matching *_split column.
split_targets = {
    'ACTOR1': 'actor1_split',
    'ASSOC_ACTOR_1': 'assoc_actor_1_split',
    'ACTOR2': 'actor2_split',
    'ASSOC_ACTOR_2': 'assoc_actor_2_split',
}
for source_col, list_col in split_targets.items():
    df1[list_col] = df1[source_col].str.split("; ")

In [62]:
# Notes of violent events whose INTERACTION code is 50
is_violent_code_50 = df1['is_violent'].eq(1) & df1['INTERACTION'].eq(50)
filtered_notes_50 = df1.loc[is_violent_code_50, "NOTES"]

# Show at most the first 30 matching notes
print(filtered_notes_50.head(30))

3680                                                                                                                                                                                                                                                                                                                                                                                                                                                       On 27 May 2020, several hundred people marched around the Civic Center in Los Angeles - Central (California) in support of Black Lives Matter and against police brutality and the death of George Floyd. Demonstrators blocked the freeway and surrounded a California Highway Patrol car, vandalizing it during the confrontation. One demonstrator was injured after falling off the CHP cruiser as it drove away. The injured demonstrator received medical attention.
3745                                                                                              

In [63]:
# Notes of violent events whose INTERACTION code is 55
is_violent_code_55 = df1['is_violent'].eq(1) & df1['INTERACTION'].eq(55)
filtered_notes_55 = df1.loc[is_violent_code_55, "NOTES"]

# Show at most the first 30 matching notes
print(filtered_notes_55.head(30))

6477                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    

In [65]:
# Notes of violent events whose INTERACTION code is 57
is_violent_code_57 = df1['is_violent'].eq(1) & df1['INTERACTION'].eq(57)
filtered_notes_57 = df1.loc[is_violent_code_57, "NOTES"]

# Show at most the first 30 matching notes
print(filtered_notes_57.head(30))

5465                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  On 1 June 2020, about 200 people staged a march in Stafford (Virginia) in support of the Black Lives Matter moveme

In [59]:
df1.loc[(df1['is_violent'] == 1)]['INTERACTION'].value_counts()

16    412
15    300
50     51
36     41
55     34
57     15
56     13
68      5
35      2
58      1
Name: INTERACTION, dtype: int64

In [61]:
# Notes of violent events whose INTERACTION code is 36
is_violent_code_36 = df1['is_violent'].eq(1) & df1['INTERACTION'].eq(36)
filtered_notes_36 = df1.loc[is_violent_code_36, "NOTES"]

# Show at most the first 30 matching notes
print(filtered_notes_36.head(30))

4185                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          On 30 May 2020, in Visalia (California), two women were run over by a Jeep during a Black Lives Matter protest over the killing of George Floyd. Authorities are still investigating the incident and the 

In [79]:
# Substrings in NOTES that signal a vehicle was part of the event
keywords = ['drive', 'drove', 'driver', 'driving']
#  ' car ', ' truck ', ' motorcycle ' <-- add these for more vehicle stuff

# Lowercase every note once, then OR together one plain-substring test per keyword
notes_lower = df1['NOTES'].astype(str).str.lower()
mentions_vehicle = pd.Series(False, index=df1.index)
for keyword in keywords:
    mentions_vehicle = mentions_vehicle | notes_lower.str.contains(keyword, regex=False)

# Store the flag as 1/0 rather than True/False
df1['vehicle_involved'] = mentions_vehicle.astype(int)

In [80]:
df1['vehicle_involved'].value_counts()

0    17909
1      567
Name: vehicle_involved, dtype: int64

In [70]:
df1.loc[(df1['is_violent'] == 1)]['vehicle_involved'].value_counts()

0    803
1     71
Name: vehicle_involved, dtype: int64

In [72]:
df1.loc[(df1['is_violent'] == 1)]['crowd_size_numerical'].value_counts()

200.0     239
2000.0     68
24.0       67
100.0      64
300.0      57
50.0       43
110.0      42
30.0       22
150.0      20
12.0       17
15.0       17
40.0       16
10.0       15
48.0       15
60.0       14
5.0        14
1000.0     13
20.0       13
1020.0     13
115.0      12
210.0      10
55.0        9
305.0       8
1010.0      7
70.0        7
500.0       7
250.0       6
75.0        6
36.0        6
105.0       6
195.0       5
205.0       4
155.0       4
80.0        3
35.0        2
400.0       1
25.0        1
95.0        1
Name: crowd_size_numerical, dtype: int64

# Drop the NaN's for events without affiliated groups

In [88]:
# Count the events where neither associated-actor column is filled in
both_assoc_missing = df['ASSOC_ACTOR_1'].isna() & df['ASSOC_ACTOR_2'].isna()
unknown_actors = int(both_assoc_missing.sum())
print(f'Number of rows where both ASSOC_ACTOR_1 and ASSOC_ACTOR_2 are NaN: {unknown_actors}')

Number of rows where both ASSOC_ACTOR_1 and ASSOC_ACTOR_2 are NaN: 13432


# SPLC Hate Groups

In [82]:
df_hate = pd.read_csv('splc-hate-groups-2022.csv')

In [86]:
# The main group name is whatever precedes the first '-' in each Title
titles = df_hate['Title']
df_hate['MainGroup'] = [title.split('-')[0].strip() for title in titles]

# Distinct main group names, in order of first appearance
unique_main_groups = df_hate['MainGroup'].unique()

# Plain Python list of those names
hate_list = unique_main_groups.tolist()

print(hate_list)



# Regex the crowdsize stuff

In [None]:
df['crowd_size_numerical'].value_counts()

In [None]:
# Scratch mapping of magnitude words to numbers; every key must be a quoted
# string (a bare `hundreds` is an undefined name and raises NameError).
{'hundred': 100,
 'hundreds': 100}

# !!PRINT THE NEW FILES FOR EXPLORATION DO THIS FIRST TOMORROW!!

In [None]:
# df.to_csv(file_name, encoding='utf-8', index=False)

# Events that happen at same lat/long on same day or next day

In [None]:
strategic_developments = df.loc[df['DISORDER_TYPE']== 'Strategic developments']

In [None]:
strategic_developments.loc[strategic_developments["SUB_EVENT_TYPE"]=="Change to group/activity"]["NOTES"].head(20)
# df.loc[df['SUB_EVENT_TYPE']== 'Disrupted weapons use']["NOTES"].head()

In [None]:
# df[df['SUB_EVENT_TYPE' == 'Disrupted weapons use']]
df.loc[df['SUB_EVENT_TYPE']== 'Disrupted weapons use']["NOTES"].head(30)

In [None]:
# TODO: unfinished selection on `df` - the bare `df[]` that stood here is a
# SyntaxError, so it is parked as a comment until the intended filter is written.

# Preprocess data

In [None]:
# Model inputs come from df1 (the rows with a known crowd size); the target is the
# binary 'is_violent' flag built earlier in this notebook. Only numeric columns are
# selected because text columns cannot be passed to the scaler/model below.
# NOTE(review): this feature list is a starting point - extend it as features are engineered.
FEATURE_COLUMNS = ['crowd_size_numerical', 'vehicle_involved', 'LATITUDE', 'LONGITUDE']
TARGET_COLUMN = 'is_violent'

X = df1[FEATURE_COLUMNS]
y = df1[TARGET_COLUMN]

# Train Test Split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Feature Scaling

In [None]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# Regression or classification model fit

In [None]:
# Fit a single model. Fitting LinearRegression first and then rebinding `model`
# to the classifier throws the first fit away, so only the classifier is trained.
# Use `model = LinearRegression()` here instead if the target is continuous.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make Predictions


In [None]:
y_pred = model.predict(X_test)


# Evaluate

In [None]:
mse = mean_squared_error(y_test, y_pred) #regression
print(f'Mean Squared Error: {mse}')
accuracy = accuracy_score(y_test, y_pred) #classification
print(f'Accuracy: {accuracy}')

# TRASH ZONEEEE

In [None]:
import re

def parse_crowd_size(crowd_size_string):
    """Best-effort conversion of a 'crowd size=...' tag into a number.

    Forms handled, tried in this order:
      * a bare number ("140", "1,000")            -> int
      * a range ("35 to 40", "24-150+")           -> float midpoint of the two ends
      * "...dozen..."   -> 12, or (N + 1) * 12 when written "over N dozen"
      * "...hundred..." -> 100, or (N + 1) * 100 when written "over N hundred"

    Parameters
    ----------
    crowd_size_string : str or any
        Tag text, with or without the 'crowd size=' prefix (any casing).

    Returns
    -------
    int, float or None
        The parsed size, or None for non-string input (e.g. NaN) and for text
        matching none of the forms above.
    """
    # Missing tags show up as NaN/None; there is nothing to parse
    if not isinstance(crowd_size_string, str):
        return None

    def _to_int(digits):
        # Drop thousands separators before converting ("1,000" -> 1000)
        return int(digits.replace(',', ''))

    # Lowercase first so the prefix is removed whatever its casing
    text = crowd_size_string.lower().replace('crowd size=', '').strip()

    # Exact number
    if re.fullmatch(r'\d[\d,]*', text):
        return _to_int(text)

    # Ranges: "35 to 40", "24-150+". re.search is used instead of a leading
    # greedy ".*", which swallowed all but the last digit of the lower bound
    # ("35 to 40" was read as 5..40).
    match = re.search(r'(\d[\d,]*)\s*(?:to|-|\u2013)\s*(\d[\d,]*)', text)
    if match:
        lower = _to_int(match.group(1))
        upper = _to_int(match.group(2))
        return (lower + upper) / 2

    # Qualitative descriptions: "over N <unit>" is bumped to the next whole unit
    if 'dozen' in text:
        match = re.search(r'over\s*(\d+)\s*dozen', text)
        return (int(match.group(1)) + 1) * 12 if match else 12

    if 'hundred' in text:
        match = re.search(r'over\s*(\d+)\s*hundred', text)
        return (int(match.group(1)) + 1) * 100 if match else 100

    # Additional cases and logic can be added here

    # Nothing matched
    return None

# Example usage
# crowd_sizes = [
#     'crowd size=over 24',
#     'crowd size=140',
#     'crowd size=approximately 300',
#     'crowd size=35 to 40',
#     'crowd size=approximately between 24-150+',
#     # ...
# ]

# Parse the crowd sizes. The tags come from the 'crowd_size' column built from TAGS;
# `crowd_size_list` was otherwise never defined, so this cell raised NameError.
crowd_size_list = df['crowd_size'].tolist()
parsed_sizes = [parse_crowd_size(cs) for cs in crowd_size_list]

# Display the parsed sizes
print(parsed_sizes)


In [None]:
# Tally each parsed size in one pass; calling list.count() once per distinct
# value rescans the whole list every time.
print([[size, count] for size, count in Counter(parsed_sizes).items()])

# print(dict((x,parsed_sizes.count(x)) for x in set(parsed_sizes)))