# Clean Usage Dataset

---
### <i>Changelogs:</i>

  Name  |  Date  |   Description
- **Bao Tran**  |  03/17  |  Create notebook, clean data in firmographic dataset. Reorganize folder and files.
- **Kiet Vu**  |  03/17  | Minor Editing.
---

In [1]:
#Import libraries
import pandas as pd
import numpy as np

In [2]:
# Load the raw usage export into `data` and preview its first 15 rows
data = pd.read_csv(filepath_or_buffer="Raw Data/usage_dataset_20230306.csv")
data.head(n=15)

Unnamed: 0,unique_identifier,usage,log_usage,status
0,2ac7d64df7018a4137d7c5cf98c40061,6256.8,3.796352,ACTIVE
1,26fe8b75ad11c751336e76ef00aa9c29,8638.0,3.936413,FINALLED
2,26fe8b75ad11c751336e76ef00aa9c29,24173.0,4.383331,FINALLED
3,afcd5e3beba66fb8ca35625557de98af,11000.0,4.041393,FINALLED
4,e7df36f919ffd1d6ab666ac6edd995aa,12823.0,4.10799,FINALLED
5,309e4667ec44b5ff789b05075637fd2f,44156.0,4.64499,FINALLED
6,7de37d6386e8af4fda4a965766136ad1,5913.0,3.771808,ACTIVE
7,4dc18e24bce7f6c1894400fb7b25f450,30380.0,4.482588,ACTIVE
8,019b70147b78a066e76e6e9cb7cc40bb,20313.0,4.307774,ACTIVE
9,85dc06d71276a5802eec502e21095be9,10387.0,4.01649,FINALLED


For each firm, we sum the usage across all of its records, regardless of status.

In [3]:
# Total usage per firm: every row sharing a unique_identifier is collapsed
# into a single summed `usage` value (the identifier becomes the index).
df_usage = data.groupby("unique_identifier")[["usage"]].sum()
df_usage

Unnamed: 0_level_0,usage
unique_identifier,Unnamed: 1_level_1
0001230a214b39e0e5c463bfe440fb15,81440.0
000345e997e72b61b990d2689c76427f,556.3
0003c4d7aeb24f319f0d7c6ddb60bb8f,32564.0
00082675e86a9f3cf5fdcc5d4cd9114d,5519.0
00095201031df44962513f378842d521,5946.0
...,...
fff7c0d1b2f896b1018ef67a9d286361,11980.0
fffce8918ebaae88423f62806f22c414,11080.0
fffd155082881fa090b08ca6ceed7005,93876.0
fffe3fcb6ca0166ec15af3958ac145e8,9072.0


In [4]:
# One row per firm that has at least one "ACTIVE" record
df_active = data.loc[
    data['status'].eq('ACTIVE'),
    ["unique_identifier", "status"],
].drop_duplicates()
df_active.head(n=15)

Unnamed: 0,unique_identifier,status
0,2ac7d64df7018a4137d7c5cf98c40061,ACTIVE
6,7de37d6386e8af4fda4a965766136ad1,ACTIVE
7,4dc18e24bce7f6c1894400fb7b25f450,ACTIVE
8,019b70147b78a066e76e6e9cb7cc40bb,ACTIVE
11,4f478a7903a1acdba35ab3cb60791cc5,ACTIVE
13,8fba9b2aadf0956b2fe8f7e8912085d1,ACTIVE
15,02c0ab0c3404183d3f15e6c6c1f93d68,ACTIVE
17,6ef8a95463942395dc046fc1322d5e87,ACTIVE
20,ae773a6b4cfd9b2bc11a4c469ee7dfa2,ACTIVE
22,923e379b49e0aa5eae54dbc5f0662f31,ACTIVE


In [5]:
# Firms with a "FINALLED" record, one row each, excluding any firm that also
# has an "ACTIVE" record (ACTIVE takes precedence over FINALLED).
active_list = df_active['unique_identifier'].tolist()
df_finalled = (
    data.loc[data['status'].eq('FINALLED'), ["unique_identifier", "status"]]
    .drop_duplicates()
    .loc[lambda frame: ~frame['unique_identifier'].isin(active_list)]
)
df_finalled.head(n=15)

Unnamed: 0,unique_identifier,status
1,26fe8b75ad11c751336e76ef00aa9c29,FINALLED
3,afcd5e3beba66fb8ca35625557de98af,FINALLED
4,e7df36f919ffd1d6ab666ac6edd995aa,FINALLED
5,309e4667ec44b5ff789b05075637fd2f,FINALLED
9,85dc06d71276a5802eec502e21095be9,FINALLED
12,c993ae9e457955b3c13f3162539e615b,FINALLED
16,994b9644033a3bfe004e9d54ea232e51,FINALLED
18,b81153b6af9da7901392158e433d5bfa,FINALLED
19,76a070bd4c068d5a828474bb8af54dc4,FINALLED
21,ed98ee2bc222b3eb378894ad13b668a5,FINALLED


In [6]:
# Firms with an "EXPECT FINALLED" record, one row each, excluding any firm
# already labelled ACTIVE or FINALLED in the earlier steps.
finalled_list = df_finalled['unique_identifier'].tolist()
value_list = [*active_list, *finalled_list]
df_expect = (
    data.loc[data['status'].eq('EXPECT FINALLED'), ["unique_identifier", "status"]]
    .drop_duplicates()
    .loc[lambda frame: ~frame['unique_identifier'].isin(value_list)]
)

In [7]:
# Stack the ACTIVE, FINALLED and EXPECT FINALLED groups into a single
# identifier -> status table with a fresh 0..n-1 index.
df_status = pd.concat(
    objs=[df_active, df_finalled, df_expect],
    axis=0,
    ignore_index=True,
)
df_status.head(n=5)

Unnamed: 0,unique_identifier,status
0,2ac7d64df7018a4137d7c5cf98c40061,ACTIVE
1,7de37d6386e8af4fda4a965766136ad1,ACTIVE
2,4dc18e24bce7f6c1894400fb7b25f450,ACTIVE
3,019b70147b78a066e76e6e9cb7cc40bb,ACTIVE
4,4f478a7903a1acdba35ab3cb60791cc5,ACTIVE


In [8]:
# Inspect the last rows of the combined status table
df_status.tail(n=5)

Unnamed: 0,unique_identifier,status
50881,ae782d21ce800df446e3862e9b387d61,EXPECT FINALLED
50882,215b411b68a49380ce6c5375747870c9,EXPECT FINALLED
50883,eb04dcc800ae8efc7818497c1187af16,EXPECT FINALLED
50884,d924b61eb8a6f41f50310986afa4d7f1,EXPECT FINALLED
50885,9f4510bbca3ed120d87d534242655ad4,EXPECT FINALLED


In [9]:
# Attach each firm's status to its total usage. The left join keeps every
# firm present in df_usage, matching on unique_identifier.
df_clean = pd.merge(df_usage, df_status, how='left', on='unique_identifier')
df_clean.head(n=15)

Unnamed: 0,unique_identifier,usage,status
0,0001230a214b39e0e5c463bfe440fb15,81440.0,FINALLED
1,000345e997e72b61b990d2689c76427f,556.3,ACTIVE
2,0003c4d7aeb24f319f0d7c6ddb60bb8f,32564.0,FINALLED
3,00082675e86a9f3cf5fdcc5d4cd9114d,5519.0,FINALLED
4,00095201031df44962513f378842d521,5946.0,ACTIVE
5,000a04481ee5acbb856a7c485a67423a,75468.0,FINALLED
6,000bee0b537b676a975a15999776581f,88280.0,FINALLED
7,000c88d34beda722f7b559bb056b7809,109258.0,ACTIVE
8,000f645a52095f72ec723133e2b0092c,9686.0,FINALLED
9,00109796f3c34d87f1ff2778498a8016,65700.0,FINALLED


In [10]:
# Write the cleaned usage table to disk, omitting the row index
df_clean.to_csv(path_or_buf='Clean Data/usage_clean_20230314.csv', index=False)