In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
# For other editors just put the pathings to a holder folder

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
    #for filename in filenames:
        #print(os.path.join(dirname, filename))

ram_price = '/kaggle/input/ethereum-effect-pc-parts/FACT_RAM_PRICE.csv'
ram_product = '/kaggle/input/edited-dim-ram-prod/DIM_RAM_PROD-1.csv'
ram_price_df = pd.read_csv(ram_price)
ram_product_df = pd.read_csv(ram_product)

# merge the dataframes by ID
merged_ram = ram_price_df.merge(ram_product_df, left_on='ProdId', right_on='Id')

for x in range(100):
    if x % 2 == 1 or x % 5 == 0:
        merged_ram.loc[x, 'Price_USD'] = np.NaN
    if x % 2 == 0 and x % 3 == 0:
        merged_ram.loc[x, 'Speed'] = np.NaN

# shortens dataframe to make it manageable
merged_ram = merged_ram.head(100)

# interpolate the data
merged_ram = merged_ram.interpolate()

# interpolating doesn't remove the first NaN values, so we use bfill to get rid of those
merged_ram = merged_ram.bfill()

# now we filter by pricing
merged_ram = merged_ram[merged_ram['Price_USD'] < 13]
merged_ram = merged_ram.reset_index()

# an index column is added from when we reset the index, so we drop this column
merged_ram = merged_ram.drop('index', axis=1)
merged_ram

# more random ways of inserting NA values
for x in range(len(merged_ram)):
    if x % 2 == 1:
        merged_ram.loc[x, 'Speed'] = np.NaN
    if x % 3 == 0:
        merged_ram.loc[x, 'Price_Original'] = np.NaN
        
        
merged_ram = merged_ram.fillna(0)
merged_ram = merged_ram.sort_values(by='Price_Original')

# now we have an extremely small dataframe with a particular set of gpus (only 12 left)
# take out the rows that were filled with 0s using filtering
merged_ram = merged_ram[(merged_ram['Price_Original'] != 0) & (merged_ram['Speed'] != 0)]

# 4 GPUS priced less than 13 USD whos original index was not used to insert NA values
display(merged_ram)

gpu_price = '/kaggle/input/ethereum-effect-pc-parts/FACT_GPU_PRICE.csv'
gpu_product = '/kaggle/input/ethereum-effect-pc-parts/DIM_GPU_PROD.csv'

gpu_price_df = pd.read_csv(gpu_price)
gpu_product_df = pd.read_csv(gpu_product)

# merges the price and gpu dataframes by IDs
merged_gpu = gpu_price_df.merge(gpu_product_df, how='inner', left_on='ProdId', right_on='Id')
merged_gpu = merged_gpu.drop('Id', axis=1)

# average price before inserting NA values and filling them
before = merged_gpu['Price_USD'].mean()

# random ways to insert NaN values into the rows
for x in range(100):
    # for odd rows, make the price = NA
    if x % 2 == 1:
        merged_gpu.loc[x, 'Price_USD'] = np.NaN
    
    # for each row with an index divisible by 4, set the price = NA
    if x % 4 == 0:
        merged_gpu.loc[x, 'Price_USD'] = np.NaN
        
    # if there is an M in the GPU manufacturer for that GPU, set the original price and memory capacity to NA
    elif 'M' in merged_gpu.loc[x, 'GPU_Manufacturer']:
        merged_gpu.loc[x, 'Price_Original'] = np.NaN
        merged_gpu.loc[x, 'Memory_Capacity'] = np.NaN

# made merged_gpu only the first 100 rows of the original dataframe, since it originally had over a million rows
merged_gpu = merged_gpu.head(100)

# use bfill first to fill every NA value EXCEPT for the last row
# in order to fill the last NA value, use ffill after bfill
merged_gpu = merged_gpu.ffill().bfill()

# get the average price in USD after filling NA values
after = merged_gpu['Price_USD'].mean()

# print the difference in average prices
print(before," ",after)

# display the dataframe
display(merged_gpu)

Unnamed: 0,ProdId,TimeId,RegionId,MerchantId,Price_USD,Price_Original,Id,Manufacturer,RAM_Name,Memory_Type,Speed,Capacity
2,1,20130622,4,32,12.936925,9.795,1,ADATA,Adata,DDR,400.0,0.5
4,1,20130624,4,32,12.853993,9.795,1,ADATA,Adata,DDR,400.0,0.5
8,1,20130628,4,32,12.868519,9.87,1,ADATA,Adata,DDR,400.0,0.5
10,1,20130630,4,32,12.860377,9.885,1,ADATA,Adata,DDR,400.0,0.5


450.519616920738   549.0708562944441


Unnamed: 0,ProdId,TimeId,RegionId,MerchantId,Price_USD,Price_Original,Processor_Manufacturer,Processor,GPU_Manufacturer,Memory_Capacity,Memory_Type
0,1,20140917,4,32,548.098896,463.90,AMD,C420,Matrox,2.0,GDDR5
1,1,20140918,4,32,548.098896,463.90,AMD,C420,Matrox,2.0,GDDR5
2,1,20140919,4,32,548.098896,463.90,AMD,C420,Matrox,2.0,GDDR5
3,1,20140920,4,32,548.098896,463.90,AMD,C420,Matrox,2.0,GDDR5
4,1,20140921,4,32,548.098896,424.53,AMD,C420,Matrox,2.0,GDDR5
...,...,...,...,...,...,...,...,...,...,...,...
95,1,20141222,4,32,547.187810,447.43,AMD,C420,Matrox,2.0,GDDR5
96,1,20141223,4,32,547.187810,449.46,AMD,C420,Matrox,2.0,GDDR5
97,1,20141224,4,32,547.187810,449.46,AMD,C420,Matrox,2.0,GDDR5
98,1,20141225,4,32,549.943733,449.46,AMD,C420,Matrox,2.0,GDDR5
