# Spring Cleaning!

Harold's stock data is a mess! Help him clean up his data before the auditors arrive!

In [36]:
# Import Libraries
import pandas as pd
from pathlib import Path

### Load CSV data into Pandas using `read_csv`

In [37]:
# Location of the raw CSV, relative to this notebook; `filepath` is reused
# further down to count the lines of the original file.
filepath = Path("..") / ".." / "Resources" / "stock_data.csv"
# Parse the CSV into a DataFrame using pandas' default settings.
stock_df = pd.read_csv(filepath_or_buffer=filepath)

### Identify the number of rows and columns (shape) in the DataFrame.

In [38]:
# Display the DataFrame dimensions as a (rows, columns) tuple.
stock_df.shape

(504, 14)

### Generate a sample of the data to visually ensure data has been loaded in correctly.

In [39]:
# Preview the first 10 rows to eyeball whether the file was parsed sensibly.
stock_df.head(10)

Unnamed: 0,symbol,name,sector,price,price_per_earnings,dividend_yield,earnings_per_share,52_week_low,52_week_high,market_cap,ebitda,price_per_sales,price_per_book,sec_filings
0,MMM,3M Company,Industrials,$222.89,24.31,2.332862,$7.92,259.77,175.49,138721100000.0,9048000000.0,4.390271,11.34,http://www.sec.gov/cgi-bin/browse-edgar?action...
1,AOS,A.O. Smith Corp,Industrials,,,,,,,,,,,
2,ABT,Abbott Laboratories,Health Care,56.27,22.51,1.908982,0.26,64.6,42.28,102121000000.0,5744000000.0,3.74048,3.19,http://www.sec.gov/cgi-bin/browse-edgar?action...
3,ABBV,AbbVie Inc.,Health Care,108.48,19.41,2.49956,3.29,125.86,60.05,181386300000.0,10310000000.0,6.291571,26.14,http://www.sec.gov/cgi-bin/browse-edgar?action...
4,ATVI,Activision Blizzard,Information Technology,65.83,,0.431903,1.28,74.945,38.93,52518670000.0,2704000000.0,10.59512,5.16,http://www.sec.gov/cgi-bin/browse-edgar?action...
5,AYI,Acuity Brands Inc,Industrials,108.48,18.22,0.351185,7.43,225.36,142.0,6242378000.0,587800000.0,1.795347,3.55,http://www.sec.gov/cgi-bin/browse-edgar?action...
6,ADBE,Adobe Systems Inc,Information Technology,185.16,52.31,0.0,3.39,204.45,114.451,94550210000.0,2538040000.0,13.092818,11.06,http://www.sec.gov/cgi-bin/browse-edgar?action...
7,AAP,Advance Auto Parts,Consumer Discretionary,109.63,19.54,0.218321,6.19,169.55,78.81,8123612000.0,853941000.0,1.130106,2.51,http://www.sec.gov/cgi-bin/browse-edgar?action...
8,AMD,Advanced Micro Devices Inc,Information Technology,$11.22,187.0,0.0,0.03,15.65,9.7,11191660000.0,,2.109195,21.47,http://www.sec.gov/cgi-bin/browse-edgar?action...
9,AES,,,,,,,,,,,,,


### Identify the number of records in the DataFrame, and compare it with the number of rows in the original file.

In [40]:
# Number of records pandas parsed into the DataFrame.
record_count = stock_df.shape[0]
print(f"Number of dataframe records: {record_count}")

# Count the physical lines of the raw file. This count includes every line in
# the file (the header line too), so it is not expected to equal record_count.
with open(filepath, 'r') as csv_file:
    line_count = sum(1 for _ in csv_file)
print(f"Number of rows in original file: {line_count}")

Number of dataframe records: 504
Number of rows in original file: 505


### Identify null records

In [41]:
# Total number of missing cells: per-column null counts, summed across columns.
stock_df.isna().sum().sum()

71

### Drop Null Records

In [42]:
# Remove every row that has at least one missing value in any column
# (these are dropna's defaults, spelled out for the reader).
stock_df = stock_df.dropna(axis=0, how="any")

### Validate nulls have been dropped

In [43]:
# Re-count missing cells across the whole frame; 0 means the drop worked.
stock_df.isna().sum().sum()

0

### Default null `ebitda` values to 0. Then, validate no records are null for ebitda.

In [44]:
# Default any missing ebitda value to 0.
# NOTE(review): the earlier "Drop Null Records" cell already removed every row
# containing a null, so at this point there is nothing left for fillna to fill.
# If ebitda nulls are meant to be kept as 0, this step has to run before the drop.
ebitda_filled = stock_df["ebitda"].fillna(0)
stock_df["ebitda"] = ebitda_filled
# Confirm no ebitda values remain null.
stock_df["ebitda"].isna().sum()

0

### Drop Duplicates

In [47]:
# Count fully-duplicated rows before de-duplicating.
print(f"initial duplicate check: {stock_df.duplicated().sum()}")
# Remove duplicates. drop_duplicates() returns a NEW DataFrame rather than
# modifying the frame in place, so the result must be bound back to stock_df;
# otherwise the de-duplicated frame is silently thrown away.
stock_df = stock_df.drop_duplicates()
# Count again on the de-duplicated frame to confirm none remain.
print(f"following duplicate check: {stock_df.duplicated().sum()}")

initial duplicate check: 0
following duplicate check: 0


### Sample `price` field

In [52]:
# Pull a random value from the price column to inspect its raw form and dtype.
stock_df["price"].sample()

194    46.46
Name: price, dtype: object

### Clean `price` Series by removing the `$` symbol

In [55]:
# Strip the dollar sign from the price strings.
# regex=False forces a literal match: "$" is the end-of-string anchor in a
# regular expression, and the default for `regex` differs between pandas
# versions, so being explicit keeps the behaviour the same everywhere and
# avoids the warning older versions emit for this call.
stock_df["price"] = stock_df["price"].str.replace("$", "", regex=False)
stock_df["price"]

  stock_df["price"] = stock_df["price"].str.replace("$","")


0      222.89
2       56.27
3      108.48
5      108.48
6      185.16
        ...  
499     70.24
500      76.3
501    115.53
502     50.71
503     71.51
Name: price, Length: 478, dtype: object

### Confirm data type of `price`

In [62]:
# Look up the dtype of the price column and report it.
price_dtype = stock_df["price"].dtype
print(f"Type of the \"price\" column: {price_dtype}")

Type of the "price" column: object


### Cast `price` Series as float and then validate using `dtype`

In [65]:
# Convert the cleaned price strings to 64-bit floats and store them back.
price_as_float = stock_df["price"].astype("float64")
stock_df["price"] = price_as_float
# List every column's dtype to verify that price is now numeric.
stock_df.dtypes

symbol                 object
name                   object
sector                 object
price                 float64
price_per_earnings    float64
dividend_yield        float64
earnings_per_share     object
52_week_low           float64
52_week_high          float64
market_cap            float64
ebitda                float64
price_per_sales       float64
price_per_book        float64
sec_filings            object
dtype: object