From 7e8d232c41d381077b46562477f9259861b99eee Mon Sep 17 00:00:00 2001 From: Sourcery AI <> Date: Wed, 6 Apr 2022 09:25:49 +0000 Subject: [PATCH] 'Refactored by Sourcery' --- friendster_network.py | 6 +-- helpers/gradient_descent.py | 4 +- helpers/probabilty.py | 4 +- helpers/stats.py | 9 ++-- hparams_grid_search_keras_nn.py | 3 +- k_means_clustering/utils.py | 26 +++++------ k_nearest_neighbors/utils.py | 7 +-- multiple_regression/utils.py | 4 +- natural_language_processing/utils.py | 17 +++---- sonar_clf_rf.py | 66 ++++++++++++---------------- working_with_data/utils.py | 5 +-- 11 files changed, 64 insertions(+), 87 deletions(-) diff --git a/friendster_network.py b/friendster_network.py index f006133..add6c10 100644 --- a/friendster_network.py +++ b/friendster_network.py @@ -190,12 +190,10 @@ def tenure_bucket(tenure): def predict_paid_or_unpaid(years_experience): - if years_experience < 3.0: + if years_experience < 3.0 or years_experience >= 8.5: return "paid" - elif years_experience < 8.5: - return "unpaid" else: - return "paid" + return "unpaid" ####################### diff --git a/helpers/gradient_descent.py b/helpers/gradient_descent.py index 246a684..d2b72e5 100644 --- a/helpers/gradient_descent.py +++ b/helpers/gradient_descent.py @@ -166,7 +166,7 @@ def maximize_stochastic(target_fn, gradient_fn, x, y, theta_0, alpha_0=0.01): print("using the gradient") - v = [random.randint(-10, 10) for i in range(3)] + v = [random.randint(-10, 10) for _ in range(3)] tolerance = 0.0000001 @@ -183,7 +183,7 @@ def maximize_stochastic(target_fn, gradient_fn, x, y, theta_0, alpha_0=0.01): print() print("using minimize_batch") - v = [random.randint(-10, 10) for i in range(3)] + v = [random.randint(-10, 10) for _ in range(3)] v = minimize_batch(sum_of_squares, sum_of_squares_gradient, v) diff --git a/helpers/probabilty.py b/helpers/probabilty.py index 618188b..39ac0d8 100644 --- a/helpers/probabilty.py +++ b/helpers/probabilty.py @@ -121,8 +121,8 @@ def make_hist(p, n, num_points): older = random_kid() if older == "girl": older_girl += 1 - if older == "girl" and younger == "girl": - both_girls += 1 + if younger == "girl": + both_girls += 1 if older == "girl" or younger == "girl": either_girl += 1 diff --git a/helpers/stats.py b/helpers/stats.py index 8fa8bc9..c3416a5 100644 --- a/helpers/stats.py +++ b/helpers/stats.py @@ -50,11 +50,10 @@ def median(v): if n % 2 == 1: # if odd, return the middle value return sorted_v[midpoint] - else: - # if even, return the average of the middle values - lo = midpoint - 1 - hi = midpoint - return (sorted_v[lo] + sorted_v[hi]) / 2 + # if even, return the average of the middle values + lo = midpoint - 1 + hi = midpoint + return (sorted_v[lo] + sorted_v[hi]) / 2 def quantile(x, p): diff --git a/hparams_grid_search_keras_nn.py b/hparams_grid_search_keras_nn.py index 561a631..9e19ed0 100644 --- a/hparams_grid_search_keras_nn.py +++ b/hparams_grid_search_keras_nn.py @@ -13,8 +13,7 @@ def load_data(filepath): - data = pd.read_csv(filepath) - return data + return pd.read_csv(filepath) def describe_data(data, name): diff --git a/k_means_clustering/utils.py b/k_means_clustering/utils.py index ef31e04..42989b6 100644 --- a/k_means_clustering/utils.py +++ b/k_means_clustering/utils.py @@ -32,9 +32,7 @@ def train(self, inputs): # and compute the new means based on the new assignments for i in range(self.k): - i_points = [p for p, a in zip(inputs, assignments) if a == i] - - if i_points: + if i_points := [p for p, a in zip(inputs, assignments) if a == i]: self.means[i] = vector_mean(i_points) @@ -104,26 +102,28 @@ def cluster_distance(cluster1, cluster2, distance_agg=min): def get_merge_order(cluster): - if is_leaf(cluster): - return float('inf') - else: - return cluster[0] + return float('inf') if is_leaf(cluster) else cluster[0] def bottom_up_cluster(inputs, distance_agg=min): # start with every input leaf cluster - clusters = [input for input in inputs] + clusters = list(inputs) # as long as we have more than one cluster left... while len(clusters) > 1: # find the two closest clusters - c1, c2 = min([(cluster1, cluster2) - for i, cluster1 in enumerate(clusters) - for cluster2 in clusters[:i]], - key=lambda p: cluster_distance(p[0], p[1], distance_agg)) + c1, c2 = min( + ( + (cluster1, cluster2) + for i, cluster1 in enumerate(clusters) + for cluster2 in clusters[:i] + ), + key=lambda p: cluster_distance(p[0], p[1], distance_agg), + ) + # remove them from the list of clusters - clusters = [c for c in clusters if c != c1 and c != c2] + clusters = [c for c in clusters if c not in [c1, c2]] # merge them, using merge _order = # of cluster left merged_cluster = (len(clusters), [c1, c2]) diff --git a/k_nearest_neighbors/utils.py b/k_nearest_neighbors/utils.py index 1c5254c..f3024d7 100644 --- a/k_nearest_neighbors/utils.py +++ b/k_nearest_neighbors/utils.py @@ -21,10 +21,7 @@ def majority_vote(labels): for count in vote_counts.values() if count == winner_count]) - if num_winners == 1: - return winner # unique winner, so return it - else: - return majority_vote(labels[:-1]) # try again without the farthest + return winner if num_winners == 1 else majority_vote(labels[:-1]) def knn_classify(k, labeled_points, new_point): @@ -91,7 +88,7 @@ def classify_and_plot_grid(k=1): plt.legend(loc=0) # let matplotlib choose the location plt.axis([-130,-60,20,55]) # set the axes - plt.title(str(k) + "-Nearest Neighbor Programming Languages") + plt.title(f'{str(k)}-Nearest Neighbor Programming Languages') plt.show() # diff --git a/multiple_regression/utils.py b/multiple_regression/utils.py index a10176b..9db7e9f 100644 --- a/multiple_regression/utils.py +++ b/multiple_regression/utils.py @@ -31,7 +31,7 @@ def total_sum_of_squares(y): def estimate_beta(x, y): - beta_initial = [random.random() for x_i in x[0]] + beta_initial = [random.random() for _ in x[0]] return minimize_stochastic(squared_error, squared_error_gradient, x, y, @@ -99,7 +99,7 @@ def squared_error_ridge_gradient(x_i, y_i, beta, alpha): def estimate_beta_ridge(x, y, alpha): """use gradient descent to fit a ridge regression with penalty alpha""" - beta_initial = [random.random() for x_i in x[0]] + beta_initial = [random.random() for _ in x[0]] return minimize_stochastic(partial(squared_error_ridge, alpha=alpha), partial(squared_error_ridge_gradient, alpha=alpha), diff --git a/natural_language_processing/utils.py b/natural_language_processing/utils.py index ff2438d..687ddaa 100644 --- a/natural_language_processing/utils.py +++ b/natural_language_processing/utils.py @@ -128,10 +128,7 @@ def random_y_given_x(x): def random_x_given_y(y): - if y <= 7: - return random.randrange(1, y) - else: - return random.randrange(y - 6, 7) + return random.randrange(1, y) if y <= 7 else random.randrange(y - 6, 7) def gibbs_sampling(num_iters=100): @@ -171,9 +168,7 @@ def sample_from(weights): topic_counts = [0 for _ in range(K)] document_lengths = [len(d) for d in documents] -distinct_words = set(word - for document in documents - for word in document) +distinct_words = {word for document in documents for word in document} W = len(distinct_words) D = len(documents) @@ -203,8 +198,10 @@ def choose_new_topic(d, word): random.seed(0) -document_topics = [[random.randrange(K) for word in document] - for document in documents] +document_topics = [ + [random.randrange(K) for _ in document] for document in documents +] + for d in range(D): for word, topic in zip(documents[d], document_topics[d]): @@ -212,7 +209,7 @@ def choose_new_topic(d, word): topic_word_counts[topic][word] += 1 topic_counts[topic] += 1 -for iter in range(1000): +for _ in range(1000): for d in range(D): for i, (word, topic) in enumerate(zip(documents[d], document_topics[d])): # remove this word/topic from the counts diff --git a/sonar_clf_rf.py b/sonar_clf_rf.py index 2c4f836..e4ea7f6 100644 --- a/sonar_clf_rf.py +++ b/sonar_clf_rf.py @@ -5,14 +5,10 @@ def load_csv(filename): """This method loads a csv file""" - dataset = list() + dataset = [] with open(filename, 'r') as file: csv_reader = reader(file) - for row in csv_reader: - if not row: - continue - dataset.append(row) - + dataset.extend(row for row in csv_reader if row) return dataset @@ -26,10 +22,7 @@ def str_columm_to_int(dataset, column): """This method converts a string column to int""" class_values = [row[column] for row in dataset] unique = set(class_values) - lookup = dict() - - for i, value in enumerate(unique): - lookup[value] = i + lookup = {value: i for i, value in enumerate(unique)} for row in dataset: row[column] = lookup[row[column]] @@ -39,12 +32,12 @@ def str_columm_to_int(dataset, column): def cross_validation_split(dataset, k_folds): """This method splits a dataset into k folds""" - dataset_split = list() + dataset_split = [] dataset_copy = list(dataset) fold_size = int(len(dataset) / k_folds) - for i in range(k_folds): - fold = list() + for _ in range(k_folds): + fold = [] while(len(fold) < fold_size): index = randrange(len(dataset_copy)) fold.append(dataset_copy.pop(index)) @@ -55,25 +48,21 @@ def cross_validation_split(dataset, k_folds): def accuracy_score(actual, predicted): """This method predicts the accuracy percentage""" - correct = 0 - for i in range(len(actual)): - if actual[i] == predicted[i]: - correct += 1 - + correct = sum(actual[i] == predicted[i] for i in range(len(actual))) return correct / float(len(actual)) * 100.0 def evaluate_algorithm(dataset, algorithm, k_folds, *args): """This method evaluates the algorithm using a cross validation split""" folds = cross_validation_split(dataset, k_folds) - scores = list() + scores = [] for fold in folds: train_set = list(folds) train_set.remove(fold) train_set = sum(train_set, []) - test_set = list() + test_set = [] for row in fold: row_copy = list(row) @@ -105,7 +94,7 @@ def test_split(index, value, dataset): def gini_index(groups, classes): """This method calculates the gini index for a split dataset""" # count all samples at split point - n_instances = float(sum([len(group) for group in groups])) + n_instances = float(sum(len(group) for group in groups)) # sum weighted gini index for each group gini = 0.0 for group in groups: @@ -126,9 +115,9 @@ def gini_index(groups, classes): def get_split(dataset, n_features): """This method selects the best split for the dataset""" - class_values = list(set(row[-1] for row in dataset)) + class_values = list({row[-1] for row in dataset}) b_index, b_value, b_score, b_groups = 999, 999, 999, None - features = list() + features = [] while len(features) < n_features : index = randrange(len(dataset[0]) - 1) @@ -189,21 +178,22 @@ def build_tree(train, max_depth, min_size, n_features): def predict(node, row): """This method makes a prediction with a decision tree""" - if row[node['index']] < node['value']: - if isinstance(node['left'], dict): - return predict(node['left'], row) - else: - return node['left'] + if row[node['index']] >= node['value']: + return ( + predict(node['right'], row) + if isinstance(node['right'], dict) + else node['right'] + ) + + if isinstance(node['left'], dict): + return predict(node['left'], row) else: - if isinstance(node['right'], dict): - return predict(node['right'], row) - else: - return node['right'] + return node['left'] def subsample(dataset, ratio): """This method creates a random subsample from the dataset with replacement""" - sample = list() + sample = [] n_sample = round(len(dataset) * ratio) while len(sample) < n_sample: index = randrange(len(dataset)) @@ -219,22 +209,22 @@ def bagging_predict(trees, row): def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features): """Random Forest Algorithm""" - trees = list() - for i in range(n_trees): + trees = [] + for _ in range(n_trees): sample = subsample(train, sample_size) tree = build_tree(sample, max_depth, min_size, n_features) trees.append(tree) - predictions = [bagging_predict(trees, row) for row in test] - return predictions + return [bagging_predict(trees, row) for row in test] """Test run the algorithm""" + seed(2) # load and prepare the data filename = "/home/amogh/PycharmProjects/deeplearning/indie_projects/sonar_data.csv" dataset = load_csv(filename) # convert string attributes to integers -for i in range(0, len(dataset[0]) - 1): +for i in range(len(dataset[0]) - 1): str_column_to_float(dataset, i) # convert class columns to integers str_columm_to_int(dataset, len(dataset[0]) - 1) diff --git a/working_with_data/utils.py b/working_with_data/utils.py index ee26eda..e10151d 100644 --- a/working_with_data/utils.py +++ b/working_with_data/utils.py @@ -145,10 +145,7 @@ def parse_row(input_row, parsers): def try_parse_field(field_name, value, parser_dict): """try to parse value using the appropriate function from parser_dict""" parser = parser_dict.get(field_name) # None if no such entry - if parser is not None: - return try_or_none(parser)(value) - else: - return value + return try_or_none(parser)(value) if parser is not None else value def parse_dict(input_dict, parser_dict):