Runs the product titles through scikit-learn's CountVectorizer and various machine learning models, training a separate model for each attribute column. (Binary relevance approach)

# Sample code

In [0]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm

# Load the iris data and hold out 20% of it as a test set
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=42)


def _build_pipe(classifier, with_pca=False):
    """Return a Pipeline: StandardScaler ('scl'), optional 2-component PCA ('pca'), then ``classifier`` ('clf')."""
    steps = [('scl', StandardScaler())]
    if with_pca:
        steps.append(('pca', PCA(n_components=2)))
    steps.append(('clf', classifier))
    return Pipeline(steps)


# One plain and one PCA-reduced pipeline per classifier family;
# every pipeline gets its own freshly constructed estimator instances.
pipe_lr = _build_pipe(LogisticRegression(random_state=42))
pipe_lr_pca = _build_pipe(LogisticRegression(random_state=42), with_pca=True)

pipe_rf = _build_pipe(RandomForestClassifier(random_state=42))
pipe_rf_pca = _build_pipe(RandomForestClassifier(random_state=42), with_pca=True)

pipe_svm = _build_pipe(svm.SVC(random_state=42))
pipe_svm_pca = _build_pipe(svm.SVC(random_state=42), with_pca=True)
			
# Candidate hyper-parameter values shared by the grids below
param_range = list(range(1, 11))
param_range_fl = [1.0, 0.5, 0.1]

# Each grid is a one-element list of dicts; the 'clf__' prefix addresses
# parameters of the pipeline step named 'clf'.
grid_params_lr = [dict(
    clf__penalty=['l1', 'l2'],
    clf__C=param_range_fl,
    clf__solver=['liblinear'],
)]

grid_params_rf = [dict(
    clf__criterion=['gini', 'entropy'],
    clf__min_samples_leaf=param_range,
    clf__max_depth=param_range,
    clf__min_samples_split=param_range[1:],  # 2..10
)]

grid_params_svm = [dict(
    clf__kernel=['linear', 'rbf'],
    clf__C=param_range,
)]

# Construct grid searches
jobs = -1  # handed to the random-forest and SVM searches as n_jobs


def _accuracy_search(pipe, grid, **extra):
    """Wrap ``pipe`` in a GridSearchCV over ``grid`` scored by accuracy with cv=10."""
    return GridSearchCV(estimator=pipe,
                        param_grid=grid,
                        scoring='accuracy',
                        cv=10,
                        **extra)


# The two logistic-regression searches do not set n_jobs.
gs_lr = _accuracy_search(pipe_lr, grid_params_lr)
gs_lr_pca = _accuracy_search(pipe_lr_pca, grid_params_lr)

gs_rf = _accuracy_search(pipe_rf, grid_params_rf, n_jobs=jobs)
gs_rf_pca = _accuracy_search(pipe_rf_pca, grid_params_rf, n_jobs=jobs)

gs_svm = _accuracy_search(pipe_svm, grid_params_svm, n_jobs=jobs)
gs_svm_pca = _accuracy_search(pipe_svm_pca, grid_params_svm, n_jobs=jobs)

# Searches in the order they will be fitted
grids = [gs_lr, gs_lr_pca, gs_rf, gs_rf_pca, gs_svm, gs_svm_pca]

# Display name for each position in `grids` (0-based, same order)
grid_dict = dict(enumerate([
    'Logistic Regression',
    'Logistic Regression w/PCA',
    'Random Forest',
    'Random Forest w/PCA',
    'Support Vector Machine',
    'Support Vector Machine w/PCA',
]))

# Fit every grid search, report its scores, and remember the search whose
# refitted best model scores highest on the held-out test split.
# NOTE(review): picking the winner by test-set accuracy means the reported
# test accuracy of the winner is no longer an unbiased estimate.
print('Performing model optimizations...')
best_acc = 0.0
best_clf = 0
best_gs = None  # stays None only if no search scores above 0.0 on the test set
for idx, gs in enumerate(grids):
    print('\nEstimator: %s' % grid_dict[idx])
    # Run the search on the training split
    gs.fit(X_train, y_train)
    print('Best params: %s' % gs.best_params_)
    # best_score_ is reported under the label "training accuracy"
    # (per the GridSearchCV docs it is the mean cross-validated score).
    print('Best training accuracy: %.3f' % gs.best_score_)
    # Score the best parameter setting on the test split, computing the accuracy once
    y_pred = gs.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred)
    print('Test set accuracy score for best params: %.3f ' % test_acc)
    # Strict '>' keeps the earliest search when accuracies tie
    if test_acc > best_acc:
        best_acc = test_acc
        best_gs = gs
        best_clf = idx
print('\nClassifier with best test set accuracy: %s' % grid_dict[best_clf])

# Save best grid search pipeline to file
# best_gs / best_clf are set by the fitting loop above; the whole object is
# pickled with joblib using compression level 1.
# NOTE(review): `joblib` here comes from `sklearn.externals`, which newer
# scikit-learn releases no longer ship - confirm the target environment.
dump_file = 'best_gs_pipeline.pkl'
joblib.dump(best_gs, dump_file, compress=1)
print('\nSaved %s grid search pipeline to file: %s' % (grid_dict[best_clf], dump_file))

In [0]:
# Sample pipeline: three per-column feature branches are combined with a
# FeatureUnion and fed to a random forest.
# NOTE(review): FeatureUnion, ItemSelector, SelectKBest, TfidfVectorizer and
# TruncatedSVD are not imported in the cells visible here - confirm they are
# defined before this cell is run.
pipeline = Pipeline([
    # Use FeatureUnion to combine the features from tweetSource, description
    # and flag_org_in_desc
    ('union', FeatureUnion(
        transformer_list=[

            # Pipeline for tweetSource: PCA to 5 components, then keep the 3 best
            ('tweetSource', Pipeline([
                ('selector', ItemSelector(key='tweetSource')),
                ("pca", PCA(n_components=5)), 
                ("univ_select", SelectKBest(k=3)),
            ])),

            # Pipeline for description: TF-IDF features reduced to 50
            # components with TruncatedSVD
            ('description', Pipeline([
                ('selector', ItemSelector(key='description')),
                ('tfidf', TfidfVectorizer()),
                ('best', TruncatedSVD(n_components=50)),
            ])),
            
            # Pipeline for flag_org_in_desc: the selected column is used as is
            ('flag_org_in_desc', Pipeline([
                ('selector', ItemSelector(key='flag_org_in_desc')),
                #('array', ArrayCaster()),
            ])),           
             
        ],

        # weight components in FeatureUnion (all three branches weighted 0.5)
        transformer_weights={
            'tweetSource': 0.5,
            'description': 0.5,
            'flag_org_in_desc': 0.5
        },
    )),

    # Use a random forest classifier (100 trees) on the combined features;
    # the step is named 'tree'
    ('tree', RandomForestClassifier(n_estimators = 100, n_jobs=-1)),
    
])

# Define Helper Functions

In [0]:
import json

def write_json(data,filename):
    """Serialise ``data`` as JSON and write it to ``filename``.

    The previous body was a stub that ignored both arguments and only printed
    an empty line; this writes the file so that ``read_json`` can load it back.

    Parameters
    ----------
    data : any object ``json.dump`` can serialise.
    filename : path of the file to create or overwrite.

    Raises
    ------
    TypeError if ``data`` is not JSON-serialisable; OSError if the file
    cannot be opened for writing.
    """
    with open(filename, 'w') as f:
        json.dump(data, f)
    
def read_json(filename):
    """Load and return the JSON document stored in ``filename``.

    Parameters
    ----------
    filename : path of the JSON file to read; must be non-empty.

    Returns
    -------
    The decoded JSON value (whatever ``json.load`` produces).

    Raises
    ------
    ValueError if ``filename`` is empty or None. Previously this case fell
    through to ``return datastore`` and failed with an UnboundLocalError.
    """
    if not filename:
        raise ValueError("read_json requires a non-empty filename")

    with open(filename, 'r') as f:
        return json.load(f)

In [0]:
import string

def pre_process(text):
    """Strip ASCII punctuation from ``text``, lower-case it and collapse whitespace.

    Returns a one-element list holding the cleaned sentence, with the words
    joined by single spaces.
    """
    strip_punct = str.maketrans("", "", string.punctuation)
    # After translate() no token contains a punctuation character, so no
    # separate punctuation filter is needed to obtain the same tokens.
    tokens = [token.lower() for token in text.translate(strip_punct).split()]
    return [' '.join(tokens)]

#pre_process("At eight! o'clock on Thursday morning Arthur didn't feel very good.")

In [0]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evalResults(y_test, y_pred):
    """Print accuracy and weighted precision / recall / F1; return the accuracy.

    Parameters
    ----------
    y_test : true labels.
    y_pred : predicted labels.

    Only the accuracy value is returned; the other three metrics are printed
    on the same line but not handed back to the caller.
    """
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    report = ("accuracy_score " + str(accuracy)
              + " precision_score " + str(precision)
              + " recall_score " + str(recall)
              + " f1_score " + str(f1))
    print(report)

    return accuracy

In [0]:
from sklearn.model_selection import train_test_split

def train(X,y,pipeline,test_size=0.3,random_state=42):
    """Fit ``pipeline`` on a training split of (X, y) and score it on the held-out part.

    Parameters
    ----------
    X, y : features and labels, passed straight to ``train_test_split``.
    pipeline : object with ``fit`` and ``predict``; it is fitted in place on
        the training part only.
    test_size : share of the data held out for evaluation. Defaults to 0.3,
        the value that used to be hard-coded.
    random_state : seed for the split. Defaults to 42, the value that used to
        be hard-coded, so existing calls produce the same split.

    Returns
    -------
    The value returned by ``evalResults`` for the held-out predictions
    (the accuracy); ``evalResults`` also prints the other metrics.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    pipeline.fit(x_train, y_train)
    y_pred = pipeline.predict(x_test)
    return evalResults(y_test, y_pred)

# Define Models and GridSearch

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [0]:
def _text_pipeline(model, tfidf=True):
    """Return a Pipeline: CountVectorizer ('bow'), optional TfidfTransformer ('tfidf'), then ``model`` ('model')."""
    steps = [('bow', CountVectorizer())]
    if tfidf:
        steps.append(('tfidf', TfidfTransformer()))
    steps.append(('model', model))
    return Pipeline(steps)


# Raw token counts fed directly to the model
pipeline1 = _text_pipeline(DecisionTreeClassifier(random_state=0), tfidf=False)
pipeline2 = _text_pipeline(RandomForestClassifier(random_state=0), tfidf=False)

# Token counts re-weighted with TF-IDF before the model
pipeline3 = _text_pipeline(RandomForestClassifier(random_state=0))
pipeline4 = _text_pipeline(MultinomialNB())
pipeline5 = _text_pipeline(svm.SVC(gamma='scale'))
pipeline6 = _text_pipeline(KNeighborsClassifier(n_neighbors=5, weights='uniform'))
pipeline7 = _text_pipeline(
    LogisticRegression(solver='sag', max_iter=100, random_state=0, multi_class='multinomial'))

# Pipelines that are actually evaluated. pipeline5 (the SVC one) is not in
# this list, so positions 5 and 6 hold pipeline6 and pipeline7.
pipelines = [
    pipeline1,
    pipeline2,
    pipeline3,
    pipeline4,
    pipeline6,
    pipeline7,
]

# Display name for each entry of `pipelines`, keyed by its 1-based position.
pipelines_dict = dict(enumerate([
    'DecisionTreeClassifier',
    'RandomForestClassifier',
    'RandomForestClassifier w TDIDF',
    'MultinomialNB',
    'KNeighborsClassifier',
    'LogisticRegression',
], start=1))


# scikit-learn deprecated its bundled copy of joblib in 0.21 and removed it in
# 0.23, so prefer the standalone package and keep the old location only as a
# fallback for environments that still rely on it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

def test_pipelines(X,y,pipelines,category):
    """Train and score every pipeline on (X, y) and pickle each fitted pipeline.

    Parameters
    ----------
    X, y : features and labels, forwarded to ``train``.
    pipelines : sequence of estimators; the entry at 1-based position ``k`` is
        labelled with ``pipelines_dict[k]``, so it must line up with that dict.
    category : string prefix for the dump files, which are written to the
        current working directory as ``<category>_<label>.pkl``.

    Returns
    -------
    dict mapping each pipeline label to the value returned by ``train``.
    """
    results={}
    for position, candidate in enumerate(pipelines, start=1):
        label = str(pipelines_dict[position])
        print('\nEstimator: %s' % label)
        # train() fits `candidate` in place, so the object dumped below is the fitted one
        results[label] = train(X,y,candidate)
        dump_file = category+'_'+label+'.pkl'
        joblib.dump(candidate, dump_file, compress=1)

    return results


# Run through data

In [None]:
# Mount Google Drive at /content/gdrive (requires the Google Colab runtime);
# the data file paths defined below live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
# Define data files: every input sits in the same folder of the mounted drive
_NDSC_DIR = '/content/gdrive/My Drive/Colab Notebooks/NDSC/'

# Attribute profile JSON files
json_mobile = _NDSC_DIR + 'mobile_profile_train.json'
json_fashion = _NDSC_DIR + 'fashion_profile_train.json'
json_beauty = _NDSC_DIR + 'beauty_profile_train.json'

# Training CSVs
train_mobile = _NDSC_DIR + 'mobile_data_info_train_competition.csv'
train_fashion = _NDSC_DIR + 'fashion_data_info_train_competition.csv'
train_beauty = _NDSC_DIR + 'beauty_data_info_train_competition.csv'

# Validation CSVs
val_mobile = _NDSC_DIR + 'mobile_data_info_val_competition.csv'
val_fashion = _NDSC_DIR + 'fashion_data_info_val_competition.csv'
val_beauty = _NDSC_DIR + 'beauty_data_info_val_competition.csv'

# Positional lookup used by later cells: 0-2 profiles, 3-5 train, 6-8 validation
files = [
    json_mobile, json_fashion, json_beauty,
    train_mobile, train_fashion, train_beauty,
    val_mobile, val_fashion, val_beauty,
]

In [0]:
import pandas as pd
print('Performing model testing...')

# Beauty training data (files[5])
beauty1 = pd.read_csv(files[5])
data = beauty1

performance = {}
output = pd.DataFrame()
# Every column from position 3 onwards is used as a separate prediction
# target, with the 'title' column as the only input.
# Note: unlike the fashion and mobile runs, the category passed to
# test_pipelines has no 'beauty_' prefix, so the dump files start with the
# bare column name.
for target in data.columns[3:]:
    print(target)
    # Keep only the rows where both the title and this target are present
    labelled = data[['title', target]].dropna()
    performance[target] = test_pipelines(
        labelled['title'], labelled[target], pipelines, target)


Performing model testing...
Benefits

Estimator: DecisionTreeClassifier
accuracy_score 0.8679660668682303 precision_score 0.8685041570938932 recall_score 0.8679660668682303 f1_score 0.8679315289207213

Estimator: RandomForestClassifier




accuracy_score 0.8728681715443097 precision_score 0.8736317906369266 recall_score 0.8728681715443097 f1_score 0.8728260213043312

Estimator: RandomForestClassifier w TDIDF




accuracy_score 0.8687292687938474 precision_score 0.8704293280683646 recall_score 0.8687292687938474 f1_score 0.8687784520543956

Estimator: MultinomialNB


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.7777027622038923 precision_score 0.787536237464487 recall_score 0.7777027622038923 f1_score 0.7729926203451098

Estimator: KNeighborsClassifier
accuracy_score 0.8277218422520327 precision_score 0.8288833885432095 recall_score 0.8277218422520327 f1_score 0.8273641330330225

Estimator: LogisticRegression


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.8500308216162269 precision_score 0.8534711093678925 recall_score 0.8500308216162269 f1_score 0.8500969700789233
Brand

Estimator: DecisionTreeClassifier


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


accuracy_score 0.9866459496913451 precision_score 0.9867304788774451 recall_score 0.9866459496913451 f1_score 0.9865639153108365

Estimator: RandomForestClassifier


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


accuracy_score 0.9842103052954269 precision_score 0.9832057628255125 recall_score 0.9842103052954269 f1_score 0.9834190782180428

Estimator: RandomForestClassifier w TDIDF


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


accuracy_score 0.9864079844342726 precision_score 0.986031931265279 recall_score 0.9864079844342726 f1_score 0.9860395647069295

Estimator: MultinomialNB


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.8559050378644718 precision_score 0.8648338684400722 recall_score 0.8559050378644718 f1_score 0.8319837327629165

Estimator: KNeighborsClassifier


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.963171376978961 precision_score 0.9621906207576043 recall_score 0.963171376978961 f1_score 0.9622155385347858

Estimator: LogisticRegression
accuracy_score 0.9860440375705147 precision_score 0.9869573357680049 recall_score 0.9860440375705147 f1_score 0.9858290279507133
Colour_group

Estimator: DecisionTreeClassifier
accuracy_score 0.8621352821583603 precision_score 0.8636680124811001 recall_score 0.8621352821583603 f1_score 0.8626591550743901

Estimator: RandomForestClassifier


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.8652398483433156 precision_score 0.8677781383217507 recall_score 0.8652398483433156 f1_score 0.8656279129700002

Estimator: RandomForestClassifier w TDIDF


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.8648002637507556 precision_score 0.8708543123167615 recall_score 0.8648002637507556 f1_score 0.865481086058399

Estimator: MultinomialNB


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.7067970767624595 precision_score 0.7622520283894805 recall_score 0.7067970767624595 f1_score 0.6959758476478188

Estimator: KNeighborsClassifier


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


accuracy_score 0.7932853453486455 precision_score 0.7979823574927868 recall_score 0.7932853453486455 f1_score 0.7929072271283415

Estimator: LogisticRegression
accuracy_score 0.8520248365294797 precision_score 0.8666622064355025 recall_score 0.8520248365294797 f1_score 0.8534293737270732
Product_texture

Estimator: DecisionTreeClassifier
accuracy_score 0.9859733384273219 precision_score 0.9859688494441508 recall_score 0.9859733384273219 f1_score 0.985963516593219

Estimator: RandomForestClassifier
accuracy_score 0.9866555690485612 precision_score 0.9866555678212453 recall_score 0.9866555690485612 f1_score 0.9866440227437512

Estimator: RandomForestClassifier w TDIDF
accuracy_score 0.9857959584657998 precision_score 0.9858117440027432 recall_score 0.9857959584657998 f1_score 0.985789342842853

Estimator: MultinomialNB
accuracy_score 0.938067104203905 precision_score 0.9393785637971535 recall_score 0.938067104203905 f1_score 0.9358363271776003

Estimator: KNeighborsClassifier
accuracy_sc

In [0]:
# Accuracy table: one column per attribute, one row per estimator
pd.DataFrame(performance)

Unnamed: 0,Benefits,Brand,Colour_group,Product_texture,Skin_type
DecisionTreeClassifier,0.867966,0.986646,0.862135,0.985973,0.833077
KNeighborsClassifier,0.827722,0.963171,0.793285,0.949024,0.811733
LogisticRegression,0.850031,0.986044,0.852025,0.98581,0.810763
MultinomialNB,0.777703,0.855905,0.706797,0.938067,0.727958
RandomForestClassifier,0.872868,0.98421,0.86524,0.986656,0.845403
RandomForestClassifier w TDIDF,0.868729,0.986408,0.8648,0.985796,0.838612


In [0]:
# Persist the beauty accuracy table. The frame's index holds the estimator
# names (the keys of each per-attribute result dict), so it must be written:
# with index=False the CSV rows could not be matched back to a model.
temp=pd.DataFrame.from_dict(performance)
temp.to_csv('performance_beauty.csv',index_label='estimator')

In [0]:
import pandas as pd
print('Performing model testing...')

#Fashion training data (files[4])
data = pd.read_csv(files[4])

performance1 = {}
output = pd.DataFrame()
# Every column from position 3 onwards is used as a separate prediction
# target, with the 'title' column as the only input.
for target in data.columns[3:]:
    print(target)
    # Keep only the rows where both the title and this target are present
    labelled = data[['title', target]].dropna()
    performance1[target] = test_pipelines(
        labelled['title'], labelled[target], pipelines, 'fashion_' + target)


Performing model testing...
Pattern

Estimator: DecisionTreeClassifier
accuracy_score 0.9279213391841378 precision_score 0.9289674552814453 recall_score 0.9279213391841378 f1_score 0.928303269639738

Estimator: RandomForestClassifier




accuracy_score 0.9063668129367788 precision_score 0.9058901625165294 recall_score 0.9063668129367788 f1_score 0.9042511899141106

Estimator: RandomForestClassifier w TDIDF




accuracy_score 0.9021818625060946 precision_score 0.9011102074668764 recall_score 0.9021818625060946 f1_score 0.8995787318889213

Estimator: MultinomialNB


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.7739314155696408 precision_score 0.7925558680293717 recall_score 0.7739314155696408 f1_score 0.7327968113753902

Estimator: KNeighborsClassifier
accuracy_score 0.8115147082723875 precision_score 0.8115545482381237 recall_score 0.8115147082723875 f1_score 0.8065461824265633

Estimator: LogisticRegression
accuracy_score 0.9350926377376889 precision_score 0.9422366297756087 recall_score 0.9350926377376889 f1_score 0.9369602331611113
Collar Type

Estimator: DecisionTreeClassifier
accuracy_score 0.8176405021705972 precision_score 0.8185457658010893 recall_score 0.8176405021705972 f1_score 0.8179971065822939

Estimator: RandomForestClassifier
accuracy_score 0.8219523641910126 precision_score 0.8260496641998112 recall_score 0.8219523641910126 f1_score 0.8212034858800243

Estimator: RandomForestClassifier w TDIDF
accuracy_score 0.8104833978646017 precision_score 0.8156920337036676 recall_score 0.8104833978646017 f1_score 0.8097217951413705

Estimator: MultinomialNB


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.7176170362548399 precision_score 0.7452409447294929 recall_score 0.7176170362548399 f1_score 0.6848328193258741

Estimator: KNeighborsClassifier


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.7264167546638508 precision_score 0.7342557426809688 recall_score 0.7264167546638508 f1_score 0.7255418464822068

Estimator: LogisticRegression
accuracy_score 0.8215710430599554 precision_score 0.824059826839459 recall_score 0.8215710430599554 f1_score 0.8206878976840783
Fashion Trend

Estimator: DecisionTreeClassifier
accuracy_score 0.9110275121243712 precision_score 0.9109846165467661 recall_score 0.9110275121243712 f1_score 0.910908273778328

Estimator: RandomForestClassifier
accuracy_score 0.9028690567919141 precision_score 0.9028793226124721 recall_score 0.9028690567919141 f1_score 0.9017647776531194

Estimator: RandomForestClassifier w TDIDF
accuracy_score 0.8951865113538503 precision_score 0.8955199098335332 recall_score 0.8951865113538503 f1_score 0.8937164389412626

Estimator: MultinomialNB


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.7992566740697095 precision_score 0.8196146317338838 recall_score 0.7992566740697095 f1_score 0.7847006198703046

Estimator: KNeighborsClassifier
accuracy_score 0.8095227303630512 precision_score 0.8097357608196236 recall_score 0.8095227303630512 f1_score 0.8082511787722844

Estimator: LogisticRegression


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.9161945338349272 precision_score 0.9182741720784864 recall_score 0.9161945338349272 f1_score 0.916007785483325
Clothing Material

Estimator: DecisionTreeClassifier


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.8590693257359924 precision_score 0.8604414756278641 recall_score 0.8590693257359924 f1_score 0.8595136607758128

Estimator: RandomForestClassifier


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.8401519468186135 precision_score 0.8422604631962943 recall_score 0.8401519468186135 f1_score 0.8385818478818641

Estimator: RandomForestClassifier w TDIDF


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.8355935422602089 precision_score 0.838091308536316 recall_score 0.8355935422602089 f1_score 0.8340687942225078

Estimator: MultinomialNB


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.7058309591642925 precision_score 0.7335957789811607 recall_score 0.7058309591642925 f1_score 0.6820679604305293

Estimator: KNeighborsClassifier
accuracy_score 0.7280721747388414 precision_score 0.7314763713057041 recall_score 0.7280721747388414 f1_score 0.7252393583276854

Estimator: LogisticRegression


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.8587844254510921 precision_score 0.8757642789401157 recall_score 0.8587844254510921 f1_score 0.8626580151873143
Sleeves

Estimator: DecisionTreeClassifier
accuracy_score 0.980157763579472 precision_score 0.9801774459279128 recall_score 0.980157763579472 f1_score 0.9801656621186539

Estimator: RandomForestClassifier
accuracy_score 0.9624889921492945 precision_score 0.9616077403954593 recall_score 0.9624889921492945 f1_score 0.9603877997111036

Estimator: RandomForestClassifier w TDIDF
accuracy_score 0.959641003541249 precision_score 0.9586152688661835 recall_score 0.959641003541249 f1_score 0.9568616869863337

Estimator: MultinomialNB
accuracy_score 0.8276217421446104 precision_score 0.8518798293184598 recall_score 0.8276217421446104 f1_score 0.8073288450028193

Estimator: KNeighborsClassifier
accuracy_score 0.8211762942421914 precision_score 0.8154612818760297 recall_score 0.8211762942421914 f1_score 0.8165966809407751

Estimator: LogisticRegression
accuracy_score 0.97

In [0]:
# Persist the fashion accuracy table. The frame's index holds the estimator
# names (the keys of each per-attribute result dict), so it must be written:
# with index=False the CSV rows could not be matched back to a model.
temp=pd.DataFrame.from_dict(performance1)
temp.to_csv('performance_fashion.csv',index_label='estimator')
temp


Unnamed: 0,Clothing Material,Collar Type,Fashion Trend,Pattern,Sleeves
DecisionTreeClassifier,0.859069,0.817641,0.911028,0.927921,0.980158
KNeighborsClassifier,0.728072,0.726417,0.809523,0.811515,0.821176
LogisticRegression,0.858784,0.821571,0.916195,0.935093,0.979165
MultinomialNB,0.705831,0.717617,0.799257,0.773931,0.827622
RandomForestClassifier,0.840152,0.821952,0.902869,0.906367,0.962489
RandomForestClassifier w TDIDF,0.835594,0.810483,0.895187,0.902182,0.959641


In [0]:
import pandas as pd
print('Performing model testing...')

#Mobile training data (files[3])
data = pd.read_csv(files[3])

performance2 = {}
output = pd.DataFrame()
# Every column from position 3 onwards is used as a separate prediction
# target, with the 'title' column as the only input.
for target in data.columns[3:]:
    print(target)
    # Keep only the rows where both the title and this target are present
    labelled = data[['title', target]].dropna()
    performance2[target] = test_pipelines(
        labelled['title'], labelled[target], pipelines, 'mobile_' + target)


Performing model testing...
Operating System

Estimator: DecisionTreeClassifier
accuracy_score 0.9645170789163722 precision_score 0.9643519054740965 recall_score 0.9645170789163722 f1_score 0.9643900725094192

Estimator: RandomForestClassifier


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.9678297997644287 precision_score 0.9677186961992564 recall_score 0.9678297997644287 f1_score 0.9675038612364727

Estimator: RandomForestClassifier w TDIDF


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.9672408716136631 precision_score 0.9669313519874084 recall_score 0.9672408716136631 f1_score 0.9667515070365872

Estimator: MultinomialNB


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.9295494699646644 precision_score 0.9244338063432269 recall_score 0.9295494699646644 f1_score 0.9204896206760546

Estimator: KNeighborsClassifier


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.958186101295642 precision_score 0.9574631562713628 recall_score 0.958186101295642 f1_score 0.957416263595703

Estimator: LogisticRegression
accuracy_score 0.9641489988221437 precision_score 0.9628337469287646 recall_score 0.9641489988221437 f1_score 0.9622305884665079
Features

Estimator: DecisionTreeClassifier
accuracy_score 0.7140799728905456 precision_score 0.7137554123441797 recall_score 0.7140799728905456 f1_score 0.7133157286852349

Estimator: RandomForestClassifier
accuracy_score 0.741867163673331 precision_score 0.7460866735845868 recall_score 0.741867163673331 f1_score 0.7404363088086084

Estimator: RandomForestClassifier w TDIDF
accuracy_score 0.7339037614368011 precision_score 0.7417850138223269 recall_score 0.7339037614368011 f1_score 0.7325457060295204

Estimator: MultinomialNB


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.7144188410708234 precision_score 0.7338467808946797 recall_score 0.7144188410708234 f1_score 0.7036326360140603

Estimator: KNeighborsClassifier
accuracy_score 0.7188241274144358 precision_score 0.7227753395629836 recall_score 0.7188241274144358 f1_score 0.7166429092634424

Estimator: LogisticRegression
accuracy_score 0.740765842087428 precision_score 0.7501691943655495 recall_score 0.740765842087428 f1_score 0.737108315532301
Network Connections

Estimator: DecisionTreeClassifier
accuracy_score 0.8871656232214001 precision_score 0.8835364134777758 recall_score 0.8871656232214001 f1_score 0.8846117699010444

Estimator: RandomForestClassifier
accuracy_score 0.8986909504837791 precision_score 0.8956620012865028 recall_score 0.8986909504837791 f1_score 0.8921535894077711

Estimator: RandomForestClassifier w TDIDF
accuracy_score 0.8965566306203756 precision_score 0.8940514823211244 recall_score 0.8965566306203756 f1_score 0.8887215769188056

Estimator: MultinomialNB


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.8767785998861696 precision_score 0.8791472667345837 recall_score 0.8767785998861696 f1_score 0.8615399656025197

Estimator: KNeighborsClassifier
accuracy_score 0.8841775754126352 precision_score 0.8793085892602199 recall_score 0.8841775754126352 f1_score 0.8755839530041842

Estimator: LogisticRegression


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.900398406374502 precision_score 0.8988100219692292 recall_score 0.900398406374502 f1_score 0.8923041366950952
Memory RAM

Estimator: DecisionTreeClassifier
accuracy_score 0.8719611968209444 precision_score 0.8712369267564924 recall_score 0.8719611968209444 f1_score 0.8713794773064808

Estimator: RandomForestClassifier
accuracy_score 0.8689223936418887 precision_score 0.8679563725533044 recall_score 0.8689223936418887 f1_score 0.867318306178453

Estimator: RandomForestClassifier w TDIDF
accuracy_score 0.8605072463768116 precision_score 0.859689278320029 recall_score 0.8605072463768116 f1_score 0.8590161786896854

Estimator: MultinomialNB


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.7928938756428238 precision_score 0.8054765402458989 recall_score 0.7928938756428238 f1_score 0.7817163675731492

Estimator: KNeighborsClassifier
accuracy_score 0.7889200561009818 precision_score 0.787456891546273 recall_score 0.7889200561009818 f1_score 0.7871450002960249

Estimator: LogisticRegression
accuracy_score 0.8693899018232819 precision_score 0.8704910928525722 recall_score 0.8693899018232819 f1_score 0.8668849644201458
Brand

Estimator: DecisionTreeClassifier


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.9849286205710355 precision_score 0.9854086334645245 recall_score 0.9849286205710355 f1_score 0.9850982574273295

Estimator: RandomForestClassifier


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.9832946336429309 precision_score 0.9827876301797089 recall_score 0.9832946336429309 f1_score 0.9827465695850867

Estimator: RandomForestClassifier w TDIDF


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.9839611283109735 precision_score 0.983518794590729 recall_score 0.9839611283109735 f1_score 0.9835486322589897

Estimator: MultinomialNB


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.9108187134502924 precision_score 0.911900983426561 recall_score 0.9108187134502924 f1_score 0.8932429830609042

Estimator: KNeighborsClassifier


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.9109907120743034 precision_score 0.9113895679462041 recall_score 0.9109907120743034 f1_score 0.910305886885849

Estimator: LogisticRegression
accuracy_score 0.9860896112831098 precision_score 0.9859283844419234 recall_score 0.9860896112831098 f1_score 0.9857599677304276
Warranty Period

Estimator: DecisionTreeClassifier
accuracy_score 0.8161142698654833 precision_score 0.803738552724797 recall_score 0.8161142698654833 f1_score 0.808935247828957

Estimator: RandomForestClassifier
accuracy_score 0.8517542643184024 precision_score 0.8301586757746966 recall_score 0.8517542643184024 f1_score 0.8310778516935867

Estimator: RandomForestClassifier w TDIDF
accuracy_score 0.8518236028290113 precision_score 0.828021996111184 recall_score 0.8518236028290113 f1_score 0.8292781022226025

Estimator: MultinomialNB


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.8164609624185273 precision_score 0.7870657316923872 recall_score 0.8164609624185273 f1_score 0.757914709479258

Estimator: KNeighborsClassifier
accuracy_score 0.8383719317709055 precision_score 0.8118328747076954 recall_score 0.8383719317709055 f1_score 0.816984881856333

Estimator: LogisticRegression


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.8425322424074331 precision_score 0.8123998994399579 recall_score 0.8425322424074331 f1_score 0.8067520781670225
Storage Capacity

Estimator: DecisionTreeClassifier
accuracy_score 0.9353391396146741 precision_score 0.9353757261761237 recall_score 0.9353391396146741 f1_score 0.9351676880852123

Estimator: RandomForestClassifier
accuracy_score 0.9367643177619425 precision_score 0.937411558234947 recall_score 0.9367643177619425 f1_score 0.9363346362199021

Estimator: RandomForestClassifier w TDIDF
accuracy_score 0.9360781208762207 precision_score 0.9365652685519795 recall_score 0.9360781208762207 f1_score 0.9357600943715384

Estimator: MultinomialNB


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.8726313011348641 precision_score 0.8799772266785079 recall_score 0.8726313011348641 f1_score 0.865193658681688

Estimator: KNeighborsClassifier
accuracy_score 0.8218527315914489 precision_score 0.8233830615066203 recall_score 0.8218527315914489 f1_score 0.8207310988108066

Estimator: LogisticRegression


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.939667458432304 precision_score 0.9411007045489993 recall_score 0.939667458432304 f1_score 0.938861996580553
Color Family

Estimator: DecisionTreeClassifier


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.8153241650294696 precision_score 0.814880799694603 recall_score 0.8153241650294696 f1_score 0.8139542523699446

Estimator: RandomForestClassifier


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.8154813359528488 precision_score 0.8212701129455656 recall_score 0.8154813359528488 f1_score 0.8122649755442227

Estimator: RandomForestClassifier w TDIDF


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.8148919449901768 precision_score 0.8225889084693989 recall_score 0.8148919449901768 f1_score 0.8116194411638595

Estimator: MultinomialNB


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.6989783889980353 precision_score 0.7551434679978853 recall_score 0.6989783889980353 f1_score 0.6640942335868129

Estimator: KNeighborsClassifier


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.6742632612966601 precision_score 0.6810178116175537 recall_score 0.6742632612966601 f1_score 0.6655485486651312

Estimator: LogisticRegression
accuracy_score 0.8231827111984283 precision_score 0.8388309173104596 recall_score 0.8231827111984283 f1_score 0.8190393393292703
Phone Model

Estimator: DecisionTreeClassifier


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


accuracy_score 0.9221393917879541 precision_score 0.9242753682336339 recall_score 0.9221393917879541 f1_score 0.9215029863571734

Estimator: RandomForestClassifier


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


accuracy_score 0.9088470792411154 precision_score 0.9082517737947425 recall_score 0.9088470792411154 f1_score 0.9061302453197247

Estimator: RandomForestClassifier w TDIDF


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


accuracy_score 0.9074665720033133 precision_score 0.9073801914466312 recall_score 0.9074665720033133 f1_score 0.9046506767184763

Estimator: MultinomialNB


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.597838520096241 precision_score 0.675293058715792 recall_score 0.597838520096241 f1_score 0.5513425544841871

Estimator: KNeighborsClassifier


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


accuracy_score 0.7953299412298347 precision_score 0.8013439908378833 recall_score 0.7953299412298347 f1_score 0.7944120680885433

Estimator: LogisticRegression
accuracy_score 0.9153551847907545 precision_score 0.9104682473605784 recall_score 0.9153551847907545 f1_score 0.9069073091717191
Camera

Estimator: DecisionTreeClassifier


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.6215263657313271 precision_score 0.6222209317647514 recall_score 0.6215263657313271 f1_score 0.6208969506775837

Estimator: RandomForestClassifier


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.6418598346211197 precision_score 0.64042015297284 recall_score 0.6418598346211197 f1_score 0.6378679910468299

Estimator: RandomForestClassifier w TDIDF


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.6299308662057747 precision_score 0.6290217146287568 recall_score 0.6299308662057747 f1_score 0.6249473440355111

Estimator: MultinomialNB


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy_score 0.5629659753287244 precision_score 0.6299630382755982 recall_score 0.5629659753287244 f1_score 0.5219587851012545

Estimator: KNeighborsClassifier


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


accuracy_score 0.6242374949166328 precision_score 0.6241613833356899 recall_score 0.6242374949166328 f1_score 0.6213402262871813

Estimator: LogisticRegression
accuracy_score 0.6476887623695269 precision_score 0.6497057000821879 recall_score 0.6476887623695269 f1_score 0.6363170658254919
Phone Screen Size

Estimator: DecisionTreeClassifier
accuracy_score 0.6946681618921399 precision_score 0.6942662951433761 recall_score 0.6946681618921399 f1_score 0.6933829986546622

Estimator: RandomForestClassifier
accuracy_score 0.7163829136507289 precision_score 0.7189202030222395 recall_score 0.7163829136507289 f1_score 0.7140473663246092

Estimator: RandomForestClassifier w TDIDF
accuracy_score 0.7025180956264655 precision_score 0.7073150800905172 recall_score 0.7025180956264655 f1_score 0.6992878601398235

Estimator: MultinomialNB
accuracy_score 0.6197369762463044 precision_score 0.65733426244591 recall_score 0.6197369762463044 f1_score 0.5936320773816915

Estimator: KNeighborsClassifier
accurac

In [0]:
# Turn the nested `performance2` results into a table. With the default
# orient, each top-level key becomes a column; the rendered output below shows
# one column per product attribute and one row per estimator.
temp = pd.DataFrame.from_dict(performance2)

# The row index carries the estimator names, so it must be written to the CSV.
# Saving with index=False left six unlabeled rows of scores that could not be
# matched back to a model. `index_label` gives that column a proper header
# instead of an empty one (which reads back as "Unnamed: 0").
temp.to_csv('performance_mobile.csv', index_label='Estimator')

# Last expression: rich display of the score table.
temp


Unnamed: 0,Brand,Camera,Color Family,Features,Memory RAM,Network Connections,Operating System,Phone Model,Phone Screen Size,Storage Capacity,Warranty Period
DecisionTreeClassifier,0.984929,0.621526,0.815324,0.71408,0.871961,0.887166,0.964517,0.922139,0.694668,0.935339,0.816114
KNeighborsClassifier,0.910991,0.624237,0.674263,0.718824,0.78892,0.884178,0.958186,0.79533,0.689469,0.821853,0.838372
LogisticRegression,0.98609,0.647689,0.823183,0.740766,0.86939,0.900398,0.964149,0.915355,0.705882,0.939667,0.842532
MultinomialNB,0.910819,0.562966,0.698978,0.714419,0.792894,0.876779,0.929549,0.597839,0.619737,0.872631,0.816461
RandomForestClassifier,0.983295,0.64186,0.815481,0.741867,0.868922,0.898691,0.96783,0.908847,0.716383,0.936764,0.851754
RandomForestClassifier w TDIDF,0.983961,0.629931,0.814892,0.733904,0.860507,0.896557,0.967241,0.907467,0.702518,0.936078,0.851824


# Format data for submission