-
Notifications
You must be signed in to change notification settings - Fork 0
/
PredictiveFeaturesSelection.py
130 lines (115 loc) · 7.15 KB
/
PredictiveFeaturesSelection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.regression import RandomForestRegressor
from pyspark.sql import SparkSession
from PredictionAlgorithms.PredictiveDataTransformation import PredictiveDataTransformation
from PredictionAlgorithms.PredictiveStatisticalTest import PredictiveStatisticalTest
from PredictionAlgorithms.PredictiveUtilities import PredictiveUtilities
from PredictionAlgorithms.PredictiveConstants import PredictiveConstants
class PredictiveFeaturesSelection:
    """Selects predictive features by combining random-forest feature
    importances with a statistical test: Pearson correlation for the
    regressor flow, chi-square for the classifier flow."""

    def __init__(self, spark):
        # Spark session used to read the input parquet dataset.
        self.spark = spark

    def featuresSelection(self, dataset_add, feature_colm,
                          label_colm, relation_list, relation, userId, algoName,
                          locationAddress):
        """Run feature selection on a parquet dataset and return chart data.

        :param dataset_add: path of the parquet dataset to read.
        :param feature_colm: feature column name(s) for the transformation step.
        :param label_colm: label column name(s).
        :param relation_list: column-transformation spec (log / square-root /
            exponential) applied only when ``relation`` is non-linear.
        :param relation: when equal to ``PredictiveConstants.NON_LINEAR`` the
            ``relation_list`` transformation is applied, otherwise skipped.
        :param userId: user id forwarded to the transformation and parquet
            helpers (used in generated file names).
        :param algoName: ``PredictiveConstants.RANDOMREGRESSOR`` or
            ``PredictiveConstants.RANDOMCLASSIFIER``.
        :param locationAddress: destination for the stats-chart parquet file.
        :return: dict containing feature importances, the statistical-test
            result, summary statistics and stats-chart metadata.
        :raises ValueError: if ``algoName`` is not one of the two supported
            algorithms (the original code failed later with a NameError).
        """
        dataset = self.spark.read.parquet(dataset_add)

        # Optionally change the relationship of the columns
        # (log / square-root / exponential).
        dataTransformationObj = PredictiveDataTransformation(dataset=dataset)
        if relation == PredictiveConstants.NON_LINEAR:
            dataset = dataTransformationObj.colmTransformation(
                colmTransformationList=relation_list)

        # Index / encode features and assemble the ML feature vector.
        dataTransformationObj = PredictiveDataTransformation(dataset=dataset)
        dataTransformationResult = dataTransformationObj.dataTranform(
            labelColm=label_colm,
            featuresColm=feature_colm,
            userId=userId)
        dataset = dataTransformationResult.get(PredictiveConstants.DATASET)
        categoricalFeatures = dataTransformationResult.get(PredictiveConstants.CATEGORICALFEATURES)
        numericalFeatures = dataTransformationResult.get(PredictiveConstants.NUMERICALFEATURES)
        maxCategories = dataTransformationResult.get(PredictiveConstants.MAXCATEGORIES)
        categoryColmStats = dataTransformationResult.get(PredictiveConstants.CATEGORYCOLMSTATS)
        indexedFeatures = dataTransformationResult.get(PredictiveConstants.INDEXEDFEATURES)
        label = dataTransformationResult.get(PredictiveConstants.LABEL)
        idNameFeaturesOrdered = dataTransformationResult.get(PredictiveConstants.IDNAMEFEATURESORDERED)
        featuresColm = dataTransformationResult.get(PredictiveConstants.VECTORFEATURES)

        # Descriptive statistics over the label plus numerical features.
        columnListForfeaturesStats = numericalFeatures.copy()
        columnListForfeaturesStats.insert(0, label)
        dataTransformationObj = PredictiveDataTransformation(dataset=dataset)
        summaryDict = dataTransformationObj.dataStatistics(
            categoricalFeatures=categoricalFeatures,
            numericalFeatures=columnListForfeaturesStats,
            categoricalColmStat=categoryColmStats)

        # Persist the columns needed by the stats-chart visualisation.
        datasetForStatsChart = dataset.select(columnListForfeaturesStats)
        datasetForStatsChartFileName = PredictiveUtilities.writeToParquet(
            fileName="datasetForStatsChart",
            locationAddress=locationAddress,
            userId=userId,
            data=datasetForStatsChart)
        featuresStatsDict = {"columnsName": columnListForfeaturesStats,
                             "datasetFileName": datasetForStatsChartFileName}

        # Fixed seed keeps the train/test split reproducible across runs.
        trainData, testData = dataset.randomSplit([0.80, 0.20], seed=40)

        # Statistical test and model choice depend on the algorithm.
        if algoName == PredictiveConstants.RANDOMREGRESSOR:
            statisticalTestObj = PredictiveStatisticalTest(
                dataset=dataset,
                features=numericalFeatures,
                labelColm=label)
            statisticalTestResult = statisticalTestObj.pearsonTest()
            randomForestModel = RandomForestRegressor(labelCol=label,
                                                      featuresCol=featuresColm,
                                                      numTrees=10)
            keyStatsTest = "pearson_test_data"
        elif algoName == PredictiveConstants.RANDOMCLASSIFIER:
            statisticalTestObj = PredictiveStatisticalTest(
                dataset=dataset,
                features=indexedFeatures,
                labelColm=label)
            statisticalTestResult = statisticalTestObj.chiSquareTest(
                categoricalFeatures=categoricalFeatures,
                maxCategories=maxCategories)
            randomForestModel = RandomForestClassifier(labelCol=label,
                                                       featuresCol=featuresColm,
                                                       numTrees=10)
            keyStatsTest = "ChiSquareTestData"
        else:
            # BUG FIX: the original fell through and crashed later with
            # a NameError on randomForestModel; fail fast and clearly.
            raise ValueError(
                "Unsupported algoName for features selection: %r" % (algoName,))

        randomForestModelFit = randomForestModel.fit(trainData)
        print(randomForestModelFit.featureImportances)

        # Resolve round() explicitly from builtins so the module stays safe
        # even if pyspark.sql.functions (which also exports round) is ever
        # star-imported at file level.
        import builtins
        _round = getattr(builtins, 'round')
        featuresImportance = [
            _round(x, 4) for x in randomForestModelFit.featureImportances]
        # BUG FIX: the original used featuresImportance.index(importance),
        # which maps every duplicated importance value onto the index of its
        # first occurrence; enumerate keeps one entry per feature position.
        # (The original also rounded each value a second time — redundant.)
        featuresImportanceDict = dict(enumerate(featuresImportance))
        featuresImportanceDictWithName = PredictiveUtilities.summaryTable(
            featuresName=idNameFeaturesOrdered,
            featuresStat=featuresImportanceDict)

        # Feature names ordered the same way as the importance scores.
        feat = list(idNameFeaturesOrdered.values())
        feature_imp = {PredictiveConstants.FEATURE_IMPORTANCE: featuresImportance,
                       "feature_column": feat}
        response_dict = {
            PredictiveConstants.FEATURE_IMPORTANCE: feature_imp,
            keyStatsTest: statisticalTestResult,
            'summaryDict': summaryDict,
            'categoricalSummary': categoryColmStats,
            "featuresImportanceDict": featuresImportanceDictWithName,
            "featuresStatsDict": featuresStatsDict
        }
        return response_dict