# Spring Cleaning!

Harold's stock data is a mess! Help him clean up his data before the auditors arrive!

In [2]:
# Import Libraries
import pandas as pd
from pathlib import Path

### Load CSV data into Pandas using `read_csv`

In [3]:
# Locate the raw stock CSV relative to the notebook's working directory,
# then read it into the DataFrame that the rest of the notebook cleans.
csvpath = Path("../../Resources/stock_data.csv")
df = pd.read_csv(csvpath)

### Identify the number of rows and columns (shape) in the DataFrame.

In [4]:
# Show the DataFrame's dimensions as a (row count, column count) tuple.
df.shape

(504, 14)

### Generate a sample of the data to visually ensure data has been loaded in correctly.

In [5]:
# Preview ten randomly chosen rows as a visual sanity check on the load.
# A fixed random_state keeps the preview identical across "Restart & Run All";
# the seed value itself is arbitrary.
df.sample(n=10, random_state=42)

Unnamed: 0,symbol,name,sector,price,price_per_earnings,dividend_yield,earnings_per_share,52_week_low,52_week_high,market_cap,ebitda,price_per_sales,price_per_book,sec_filings
80,AVGO,Broadcom,Information Technology,229.57,15.94,2.948858,4.01,285.68,202.61,92791970000.0,7016000000.0,6.961893,4.4,http://www.sec.gov/cgi-bin/browse-edgar?action...
1,AOS,A.O. Smith Corp,Industrials,,,,,,,,,,,
333,NKE,Nike,Consumer Discretionary,62.49,24.9,1.218955,2.51,68.83,50.35,106776100000.0,5162000000.0,3.054993,8.91,http://www.sec.gov/cgi-bin/browse-edgar?action...
380,PVH,PVH Corp.,Consumer Discretionary,142.68,20.98,0.100529,6.77,157.96,84.53,11478630000.0,1057500000.0,1.761752,2.14,http://www.sec.gov/cgi-bin/browse-edgar?action...
386,RJF,Raymond James Financial Inc.,Financials,86.06,16.94,1.098298,4.12,99.1,71.35,13216270000.0,0.0,1.973456,2.34,http://www.sec.gov/cgi-bin/browse-edgar?action...
235,HST,Host Hotels & Resorts,Real Estate,18.75,11.23,4.113111,1.02,21.53,17.26,14394720000.0,1547000000.0,3.554913,2.02,http://www.sec.gov/cgi-bin/browse-edgar?action...
306,MGM,MGM Resorts International,Consumer Discretionary,33.5,29.65,1.268743,1.92,38.41,25.15,19633670000.0,2680385000.0,2.227672,3.09,http://www.sec.gov/cgi-bin/browse-edgar?action...
184,XOM,Exxon Mobil Corp.,Energy,76.07,21.37,4.00312,1.88,89.3,76.05,326148700000.0,39052000000.0,1.770194,1.85,http://www.sec.gov/cgi-bin/browse-edgar?action...
33,AXP,American Express Co,Financials,88.34,15.0,1.495567,2.9,102.385,75.51,80410990000.0,0.0,2.273575,3.75,http://www.sec.gov/cgi-bin/browse-edgar?action...
418,SWK,Stanley Black & Decker,Consumer Discretionary,152.86,20.57,1.577564,8.05,176.62,121.09,24496400000.0,2264600000.0,1.944325,3.34,http://www.sec.gov/cgi-bin/browse-edgar?action...


### Identify the number of records in the DataFrame, and compare it with the number of rows in the original file.

In [6]:
# Count the non-null entries in each column; any column whose count is below
# the total number of rows contains missing values.
df.count()

symbol                504
name                  502
sector                501
price                 500
price_per_earnings    497
dividend_yield        499
earnings_per_share    498
52_week_low           500
52_week_high          500
market_cap            500
ebitda                492
price_per_sales       500
price_per_book        492
sec_filings           500
dtype: int64

### Identify null records

In [12]:
# Percentage of missing values per column: the mean of the boolean null mask
# is the fraction of nulls, scaled here to a 0-100 range.
100 * df.isna().mean()

symbol                0.0
name                  0.0
sector                0.0
price                 0.0
price_per_earnings    0.0
dividend_yield        0.0
earnings_per_share    0.0
52_week_low           0.0
52_week_high          0.0
market_cap            0.0
ebitda                0.0
price_per_sales       0.0
price_per_book        0.0
sec_filings           0.0
dtype: float64

### Drop Null Records

In [9]:
# Discard every row that has a missing value in at least one column.
# `df` is rebound to the returned frame rather than mutated with inplace=True,
# which keeps the step chainable and avoids in-place mutation of the loaded data.
df = df.dropna()

### Validate nulls have been dropped

In [10]:
# Total number of missing values remaining in each column; every entry should
# be zero once the null rows have been dropped.
df.isna().sum()

symbol                0
name                  0
sector                0
price                 0
price_per_earnings    0
dividend_yield        0
earnings_per_share    0
52_week_low           0
52_week_high          0
market_cap            0
ebitda                0
price_per_sales       0
price_per_book        0
sec_filings           0
dtype: int64

### Default null `ebitda` values to 0. Then, validate no records are null for ebitda.

In [15]:
# Replace any missing `ebitda` values with 0, then display how many nulls remain
# in that column (expected: 0).
# NOTE(review): when the cells run top to bottom, the earlier row-wise dropna has
# already removed every row with a null `ebitda`, so this fill has nothing left
# to replace — confirm the intended order of these two steps.
df["ebitda"] = df["ebitda"].fillna(value=0)
df["ebitda"].isna().sum()

0

### Drop Duplicates

In [None]:
# Remove rows that are exact duplicates of an earlier row, keeping the first
# occurrence. `df` is rebound to the returned frame rather than mutated with
# inplace=True, which keeps the step chainable and avoids in-place mutation.
df = df.drop_duplicates()