In [10]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score

In [11]:
# Display the data dictionary text file that describes the television dataset.
with open('../datasets/television_labels.txt', 'r') as f:
    lines = f.readlines()  # raw lines, each still ending with its '\n'
for line in lines:
    # readlines() keeps the trailing newline; strip it so print() does not add
    # a second one and double-space the whole listing.
    print(line.rstrip('\n'))

Energy Rating Data Dictionary for Televisions (labelled) as at May 2015



Column - Column Identifier - Description

Submit_ID - Registration.pk - This is the unique registration ID record for the product and is taken from the GEMS product database

Brand_Reg - Model.brand_name - This is the manufacturers brand

Model_No - Model.model_number - This is the model number of the appliance, usually a unique field

SoldIn - Registration.selling_countries - These are the countries where the product is registered for sale and may include Australia, New Zealand and/or Fiji

Country - Registration.registrationmanufacturingcountry_set - Country of manufacture

screensize - screen_size - The diagonal measurement of the screen in cm

Screen_Area - screen_area - This is the screen area (Height x Width) in cm2

Screen_Tech - screen_tech - This is the type of screen technology (LCD, Plasma, OLED, etc…)

Pasv_stnd_power - pasv_stnd_power - This is the amount of energy used by the appliance in passive s

In [12]:
# Load the television CSV into a DataFrame and summarise it.
data = pd.read_csv('../datasets/television.csv')
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
data.info()
# describe() returns a DataFrame of summary statistics, so it does need print().
print(data.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2513 entries, 0 to 2512
Data columns (total 31 columns):
Submit_ID                      2513 non-null int64
Brand_Reg                      2513 non-null object
Model_No                       2513 non-null object
Family Name                    755 non-null object
SoldIn                         2463 non-null object
Country                        2513 non-null object
screensize                     2513 non-null float64
Screen_Area                    2513 non-null float64
Screen_Tech                    2513 non-null object
Pasv_stnd_power                2453 non-null float64
Act_stnd_power                 2452 non-null float64
Act_stnd_time                  2452 non-null float64
Avg_mode_power                 2513 non-null float64
Star                           438 non-null float64
SRI                            438 non-null float64
CEC                            2513 non-null float64
SubmitStatus                   2513 non-null object
ExpD

In [13]:
# Start by listing every column name so we can spot features we might drop.
column_names = data.columns.values
print(column_names)

# Count the missing values in each column. The author judged the important
# features to be populated, leaving only irrelevant columns to remove.
missing_per_column = data.isnull().sum()
print(missing_per_column, '\n')

['Submit_ID' 'Brand_Reg' 'Model_No' 'Family Name' 'SoldIn' 'Country'
 'screensize' 'Screen_Area' 'Screen_Tech' 'Pasv_stnd_power'
 'Act_stnd_power' 'Act_stnd_time' 'Avg_mode_power' 'Star' 'SRI' 'CEC'
 'SubmitStatus' 'ExpDate' 'GrandDate' 'Product Class'
 'Availability Status' 'Star2' 'Product Website'
 'Representative Brand URL' 'Star Rating Index' 'Star Image Large'
 'Star Image Small' 'Power supply' 'Tuner Type'
 'What test standard was used' 'Registration Number']
Submit_ID                         0
Brand_Reg                         0
Model_No                          0
Family Name                    1758
SoldIn                           50
Country                           0
screensize                        0
Screen_Area                       0
Screen_Tech                       0
Pasv_stnd_power                  60
Act_stnd_power                   61
Act_stnd_time                    61
Avg_mode_power                    0
Star                           2075
SRI                      