In [1]:
# Dependencies
import pandas as pd

In [2]:
# Name of the CSV file
file = '../Resources/donors2021_unclean.csv'

In [3]:
# The correct encoding must be used to read the CSV
df = pd.read_csv(file, encoding="ISO-8859-1")

In [10]:
# Preview of the DataFrame
# Note that Memo_CD is likely a meaningless column

print(df['Memo_CD'].value_counts())

df.head()

Series([], Name: count, dtype: int64)


Unnamed: 0,Name,Employer,City,State,Zip,Amount,Memo_CD
0,"CAREY, JAMES",NOT EMPLOYED,HOCKESSIN,DE,197071618.0,500,
1,"OBICI, SILVANA",STONY BROOK,PORT JEFFERSON STATION,NY,117764286.0,250,
2,"MAISLIN, KAREN",RETIRED,WILLIAMSVILLE,NY,14221.0,250,
3,"MCCLELLAND, CARTER AND STEPHANIE",UNION SQUARE ADVISORS,NEW YORK,NY,10023.0,1000,
4,"MCCLUSKEY, MARTHA",STATE UNIVERSITY OF NEW YORK,BUFFALO,NY,14214.0,250,


In [13]:
# Delete extraneous column
del df["Memo_CD"]

In [14]:
df.head()

Unnamed: 0,Name,Employer,City,State,Zip,Amount
0,"CAREY, JAMES",NOT EMPLOYED,HOCKESSIN,DE,197071618.0,500
1,"OBICI, SILVANA",STONY BROOK,PORT JEFFERSON STATION,NY,117764286.0,250
2,"MAISLIN, KAREN",RETIRED,WILLIAMSVILLE,NY,14221.0,250
3,"MCCLELLAND, CARTER AND STEPHANIE",UNION SQUARE ADVISORS,NEW YORK,NY,10023.0,1000
4,"MCCLUSKEY, MARTHA",STATE UNIVERSITY OF NEW YORK,BUFFALO,NY,14214.0,250


In [15]:
# Identify incomplete rows
df.count()

Name        2000
Employer    1820
City        1999
State       1999
Zip         1996
Amount      2000
dtype: int64

In [16]:
# Drop all rows with missing information
df = df.dropna(how="any")

In [17]:
# Verify dropped rows
df.count()

Name        1818
Employer    1818
City        1818
State       1818
Zip         1818
Amount      1818
dtype: int64

In [19]:
# The Zip column is the wrong data type. It should be a string (object).
type(df)

df.dtypes

Name         object
Employer     object
City         object
State        object
Zip         float64
Amount        int64
dtype: object

In [23]:
# Use df.astype() method to convert the datatype of the Zip column
df = df.astype({"Zip" : str}, errors = "raise")

df.dtypes

Name        object
Employer    object
City        object
State       object
Zip         object
Amount       int64
dtype: object

In [24]:
# Verify that the Zip column datatype has been made an object
df['Zip'].dtype

dtype('O')

In [25]:
# We can view all the unique values in a column
df['Employer'].unique()

array(['NOT EMPLOYED', 'STONY BROOK', 'RETIRED', 'UNION SQUARE ADVISORS',
       'STATE UNIVERSITY OF NEW YORK', 'SELF', 'SETH PYENSON',
       'BACKSTOP SOLUTIONS', 'TAYLOR RING', 'ICI MUTUAL',
       'WEBB SANDERS & WILLIAMS PLLC', 'ALAMEDA COUNTY',
       'THE COALITION AGAINST TRAFFICKING IN W', 'GOOGLE LLC',
       'CASE WESTERN RESERVE UNIVERSITY', 'NONE', 'VELOCITYEHS',
       'UC BERKELEY', 'MLK LLC', 'CISCO', 'STATE OF OHIO', 'FANATICS INC',
       'SOFTWARE', 'MANAGER', 'CONSULTANT',
       'PAN-AMERICAN LIFE INSURANCE GROUP', 'BCR PROPERTIES',
       'WYNNEFIELD CAPITAL', 'JOHNS HOPKINS UNIVERSITY',
       'COLUMBIA UNIVERSITY', 'COHERENT SYSTEMS', 'CLEVELAND-CLIFFS INC.',
       'IRONUNITS LLC (CCI)', 'IRONUNITSLLC (CCI)', 'IRONUNITS (CCI)',
       'ASANTE PHYSICIAN PARTNERS', 'UPS', 'CHICAGO BAR COMPANY',
       'OVERTON SECURITY SERVICES', 'DRUMMOND COMPANY, INC.',
       'INGRAM BARGE', 'INGRAM BARGE COMPANY', 'INGRAM',
       'INGRAM INDUSTRIES INC.', 'INGRAM INDUSTRIES

In [26]:
# We can count the number of unique values
df['Employer'].nunique()

519

In [27]:
# We can count the number of occurrences
# for each unique value
df['Employer'].value_counts()

Employer
NOT EMPLOYED                        609
NONE                                321
SELF-EMPLOYED                       132
SELF                                 33
RETIRED                              32
                                   ... 
NOKIA CORP                            1
FH MINE SUPPLY INC.                   1
DREYER INTERNATIONAL ACADEMY LLC      1
RAY GRAHAM ASSOCIATION                1
5T WEALTH, LLC                        1
Name: count, Length: 519, dtype: int64

In [28]:
# Clean up Employer category. Replace 'SELF' and 'SELF EMPLOYED' with 'SELF-EMPLOYED'
df['Employer'] = df["Employer"].replace({"SELF" : 'SELF-EMPLOYED', "SELF EMPLOYED" : "SELF-EMPLOYED"})

In [29]:
# Verify clean-up.
df['Employer'].value_counts()

Employer
NOT EMPLOYED                            609
NONE                                    321
SELF-EMPLOYED                           180
RETIRED                                  32
INGRAM BARGE COMPANY                     30
                                       ... 
GOOGLE LLC                                1
BP INDUSTRIES INC                         1
HOT SPRINGS COUNTY DISTRICT HOSPITAL      1
INVEST AMERICA REALTY                     1
5T WEALTH, LLC                            1
Name: count, Length: 517, dtype: int64

In [31]:
# Clean up Employer category. Replace 'NOT EMPLOYED' with 'UNEMPLOYED'
df['Employer'] = df["Employer"].replace({"NOT EMPLOYED" : 'UNEMPLOYED'})
df['Employer'].value_counts()

Employer
UNEMPLOYED                        611
NONE                              321
SELF-EMPLOYED                     180
RETIRED                            32
INGRAM BARGE COMPANY               30
                                 ... 
JEROME'S COLLISION CENTER           1
LINDQUIST MORTIARIES                1
GAINESVILLE SKIN CANCER CENTER      1
RYAN SPECIALTYGROUP                 1
5T WEALTH, LLC                      1
Name: count, Length: 516, dtype: int64

In [32]:
# Display a statistical overview
df.describe()

Unnamed: 0,Amount
count,1818.0
mean,752.127613
std,11601.791128
min,-1000.0
25%,25.0
50%,50.0
75%,200.0
max,400000.0


In [33]:
# We can calculate individual aggregate functions per column
print("AMOUNT:")
print(f"The count is {df['Amount'].count()}")
print(f"The minimum is {df['Amount'].min()}")
print(f"The maximum is {df['Amount'].max()}")
print(f"The mean is {df['Amount'].mean()}")

AMOUNT:
The count is 1818
The minimum is -1000
The maximum is 400000
The mean is 752.1276127612762


In [37]:
# We can also calculate them for an entire DataFrame
df.min()

Name        ABATA, JOSEPH
Employer              11X
City                ACTON
State                  AK
Zip           100032604.0
Amount              -1000
dtype: object

In [39]:
df

Unnamed: 0,Name,Employer,City,State,Zip,Amount
0,"CAREY, JAMES",UNEMPLOYED,HOCKESSIN,DE,197071618.0,500
1,"OBICI, SILVANA",STONY BROOK,PORT JEFFERSON STATION,NY,117764286.0,250
2,"MAISLIN, KAREN",RETIRED,WILLIAMSVILLE,NY,14221.0,250
3,"MCCLELLAND, CARTER AND STEPHANIE",UNION SQUARE ADVISORS,NEW YORK,NY,10023.0,1000
4,"MCCLUSKEY, MARTHA",STATE UNIVERSITY OF NEW YORK,BUFFALO,NY,14214.0,250
...,...,...,...,...,...,...
1995,"KROMHOUT, WILEEN",UCLA,PORTER RANCH,CA,913263018.0,57
1996,"KRONHEIM, NANCY",UNEMPLOYED,ARLINGTON,VA,222041332.0,100
1997,"KROOP, STEVE","FLYTEC, USA",HIGH SPRINGS,FL,326431608.0,39
1998,"KRSEK, PAUL","5T WEALTH, LLC",NAPA,CA,945581818.0,20


In [38]:
# Save the DataFrame to a CSV file. 
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html
df.to_csv("cleaned_donors.csv", index=False)