6 changes: 2 additions & 4 deletions friendster_network.py
@@ -190,12 +190,10 @@ def tenure_bucket(tenure):


 def predict_paid_or_unpaid(years_experience):
-    if years_experience < 3.0:
+    if years_experience < 3.0 or years_experience >= 8.5:
         return "paid"
-    elif years_experience < 8.5:
-        return "unpaid"
-    else:
-        return "paid"
+    return "unpaid"


 #######################

Comment on lines -193 to -198: Function predict_paid_or_unpaid refactored with the following changes:
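The collapsed conditional is behavior-preserving. A quick standalone check (old_version/new_version are illustrative names, not part of the PR), covering the boundary values:

    def old_version(years_experience):
        if years_experience < 3.0:
            return "paid"
        elif years_experience < 8.5:
            return "unpaid"
        else:
            return "paid"

    def new_version(years_experience):
        if years_experience < 3.0 or years_experience >= 8.5:
            return "paid"
        return "unpaid"

    # both versions agree everywhere, including the edges 3.0 and 8.5
    for x in [0.0, 2.9, 3.0, 5.0, 8.4, 8.5, 12.0]:
        assert old_version(x) == new_version(x)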
4 changes: 2 additions & 2 deletions helpers/gradient_descent.py
@@ -166,7 +166,7 @@ def maximize_stochastic(target_fn, gradient_fn, x, y, theta_0, alpha_0=0.01):

 print("using the gradient")

-v = [random.randint(-10, 10) for i in range(3)]
+v = [random.randint(-10, 10) for _ in range(3)]

 tolerance = 0.0000001


Lines 169-186 refactored with the following changes:

@@ -183,7 +183,7 @@ def maximize_stochastic(target_fn, gradient_fn, x, y, theta_0, alpha_0=0.01):
 print()
 print("using minimize_batch")

-v = [random.randint(-10, 10) for i in range(3)]
+v = [random.randint(-10, 10) for _ in range(3)]

 v = minimize_batch(sum_of_squares, sum_of_squares_gradient, v)

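Here and in several files below, unused loop variables are renamed to _, the conventional Python placeholder for a value that is deliberately ignored; the rename also quiets linter warnings about unused names.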
4 changes: 2 additions & 2 deletions helpers/probabilty.py
@@ -121,8 +121,8 @@ def make_hist(p, n, num_points):
     older = random_kid()
     if older == "girl":
         older_girl += 1
-    if older == "girl" and younger == "girl":
-        both_girls += 1
+        if younger == "girl":
+            both_girls += 1
     if older == "girl" or younger == "girl":
         either_girl += 1

Comment on lines -124 to +125: Lines 124-125 refactored with the following changes:
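Nesting the younger-child test inside the older == "girl" branch is equivalent to the original and-condition, and it skips the second comparison when the first already failed. A standalone sanity check over all four cases:

    for older in ("girl", "boy"):
        for younger in ("girl", "boy"):
            original = older == "girl" and younger == "girl"
            refactored = False
            if older == "girl":
                if younger == "girl":
                    refactored = True
            assert original == refactored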
9 changes: 4 additions & 5 deletions helpers/stats.py
@@ -50,11 +50,10 @@ def median(v):
     if n % 2 == 1:
         # if odd, return the middle value
         return sorted_v[midpoint]
-    else:
-        # if even, return the average of the middle values
-        lo = midpoint - 1
-        hi = midpoint
-        return (sorted_v[lo] + sorted_v[hi]) / 2
+    # if even, return the average of the middle values
+    lo = midpoint - 1
+    hi = midpoint
+    return (sorted_v[lo] + sorted_v[hi]) / 2


 def quantile(x, p):

Comment on lines -53 to +56: Function median refactored with the following changes:
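Dropping the else after an early return changes nothing in behavior. For reference, usage of median as defined in this file:

    assert median([1, 9, 2]) == 2          # odd length: the middle value
    assert median([1, 9, 2, 10]) == 5.5    # even length: average of the two middle values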
3 changes: 1 addition & 2 deletions hparams_grid_search_keras_nn.py
@@ -13,8 +13,7 @@


 def load_data(filepath):
-    data = pd.read_csv(filepath)
-    return data
+    return pd.read_csv(filepath)


 def describe_data(data, name):

Function load_data refactored with the following changes:
26 changes: 13 additions & 13 deletions k_means_clustering/utils.py
@@ -32,9 +32,7 @@ def train(self, inputs):

         # and compute the new means based on the new assignments
         for i in range(self.k):
-            i_points = [p for p, a in zip(inputs, assignments) if a == i]
-
-            if i_points:
+            if i_points := [p for p, a in zip(inputs, assignments) if a == i]:
                 self.means[i] = vector_mean(i_points)

Function KMeans.train refactored with the following changes:
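The walrus operator := (an assignment expression) binds i_points and tests its truthiness in one step; note it requires Python 3.8 or newer. A minimal standalone sketch of the pattern:

    # assignment expression: bind the filtered list and test it at once
    if points := [p for p in [1, 2, 3] if p > 1]:
        print(sum(points) / len(points))  # the bound name stays in scope; prints 2.5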


@@ -104,26 +102,28 @@ def cluster_distance(cluster1, cluster2, distance_agg=min):


 def get_merge_order(cluster):
-    if is_leaf(cluster):
-        return float('inf')
-    else:
-        return cluster[0]
+    return float('inf') if is_leaf(cluster) else cluster[0]

Comment on lines -107 to +105: Function get_merge_order refactored with the following changes:


 def bottom_up_cluster(inputs, distance_agg=min):
     # start with every input leaf cluster
-    clusters = [input for input in inputs]
+    clusters = list(inputs)

     # as long as we have more than one cluster left...
     while len(clusters) > 1:
         # find the two closest clusters
-        c1, c2 = min([(cluster1, cluster2)
-                      for i, cluster1 in enumerate(clusters)
-                      for cluster2 in clusters[:i]],
-                     key=lambda p: cluster_distance(p[0], p[1], distance_agg))
+        c1, c2 = min(
+            (
+                (cluster1, cluster2)
+                for i, cluster1 in enumerate(clusters)
+                for cluster2 in clusters[:i]
+            ),
+            key=lambda p: cluster_distance(p[0], p[1], distance_agg),
+        )

         # remove them from the list of clusters
-        clusters = [c for c in clusters if c != c1 and c != c2]
+        clusters = [c for c in clusters if c not in [c1, c2]]

         # merge them, using merge_order = # of clusters left
         merged_cluster = (len(clusters), [c1, c2])

Comment on lines -115 to +126: Function bottom_up_cluster refactored with the following changes:
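The membership test c not in [c1, c2] matches the original pair of != checks, since in compares elements with == (a tuple literal (c1, c2) would avoid building a list on each iteration, but either works). A small illustration:

    c1, c2, c3 = (0, ['a']), (1, ['b']), (2, ['c'])
    clusters = [c1, c2, c3]
    assert [c for c in clusters if c not in [c1, c2]] == \
           [c for c in clusters if c != c1 and c != c2] == [c3]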
7 changes: 2 additions & 5 deletions k_nearest_neighbors/utils.py
@@ -21,10 +21,7 @@ def majority_vote(labels):
                        for count in vote_counts.values()
                        if count == winner_count])

-    if num_winners == 1:
-        return winner                      # unique winner, so return it
-    else:
-        return majority_vote(labels[:-1])  # try again without the farthest
+    return winner if num_winners == 1 else majority_vote(labels[:-1])


 def knn_classify(k, labeled_points, new_point):

Function majority_vote refactored with the following changes:

This removes the following comments:
    # unique winner, so return it
    # try again without the farthest
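The ternary keeps the tie-breaking recursion: on a tie, the farthest label (the last one, since callers pass labels ordered nearest-first) is dropped and the vote repeats. A quick check, assuming majority_vote as defined above:

    assert majority_vote(['a', 'b', 'a', 'b']) == 'a'   # 2-2 tie; drop farthest 'b', then 'a' wins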
@@ -91,7 +88,7 @@ def classify_and_plot_grid(k=1):

     plt.legend(loc=0)           # let matplotlib choose the location
     plt.axis([-130,-60,20,55])  # set the axes
-    plt.title(str(k) + "-Nearest Neighbor Programming Languages")
+    plt.title(f'{str(k)}-Nearest Neighbor Programming Languages')
     plt.show()

 #

Comment on lines -94 to +91: Function classify_and_plot_grid refactored with the following changes:
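One nit the tool misses: f-strings already call str() on interpolated values, so f'{k}-Nearest Neighbor Programming Languages' would be sufficient.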
4 changes: 2 additions & 2 deletions multiple_regression/utils.py
@@ -31,7 +31,7 @@ def total_sum_of_squares(y):


 def estimate_beta(x, y):
-    beta_initial = [random.random() for x_i in x[0]]
+    beta_initial = [random.random() for _ in x[0]]
     return minimize_stochastic(squared_error,
                                squared_error_gradient,
                                x, y,

Function estimate_beta refactored with the following changes:

@@ -99,7 +99,7 @@ def squared_error_ridge_gradient(x_i, y_i, beta, alpha):
 def estimate_beta_ridge(x, y, alpha):
     """use gradient descent to fit a ridge regression
     with penalty alpha"""
-    beta_initial = [random.random() for x_i in x[0]]
+    beta_initial = [random.random() for _ in x[0]]
     return minimize_stochastic(partial(squared_error_ridge, alpha=alpha),
                                partial(squared_error_ridge_gradient,
                                        alpha=alpha),

Function estimate_beta_ridge refactored with the following changes:
17 changes: 7 additions & 10 deletions natural_language_processing/utils.py
@@ -128,10 +128,7 @@ def random_y_given_x(x):


 def random_x_given_y(y):
-    if y <= 7:
-        return random.randrange(1, y)
-    else:
-        return random.randrange(y - 6, 7)
+    return random.randrange(1, y) if y <= 7 else random.randrange(y - 6, 7)


 def gibbs_sampling(num_iters=100):

Function random_x_given_y refactored with the following changes:
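For context, this is from the file's two-dice Gibbs-sampling example: x is the first die and y is the total of two dice, so conditioned on y the first die is uniform over the values consistent with that total. A boundary check, assuming that setup:

    random.seed(0)
    assert all(random_x_given_y(3) in (1, 2) for _ in range(100))    # total 3 -> first die is 1 or 2
    assert all(random_x_given_y(11) in (5, 6) for _ in range(100))   # total 11 -> first die is 5 or 6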
@@ -171,9 +168,7 @@ def sample_from(weights):
 topic_counts = [0 for _ in range(K)]
 document_lengths = [len(d) for d in documents]

-distinct_words = set(word
-                     for document in documents
-                     for word in document)
+distinct_words = {word for document in documents for word in document}

 W = len(distinct_words)
 D = len(documents)

Comment on lines -174 to +171: Lines 174-176 refactored with the following changes:
@@ -203,16 +198,18 @@ def choose_new_topic(d, word):


 random.seed(0)
-document_topics = [[random.randrange(K) for word in document]
-                   for document in documents]
+document_topics = [
+    [random.randrange(K) for _ in document] for document in documents
+]


 for d in range(D):
     for word, topic in zip(documents[d], document_topics[d]):
         document_topic_counts[d][topic] += 1
         topic_word_counts[topic][word] += 1
         topic_counts[topic] += 1

-for iter in range(1000):
+for _ in range(1000):
     for d in range(D):
         for i, (word, topic) in enumerate(zip(documents[d], document_topics[d])):
             # remove this word/topic from the counts

Comment on lines -206 to +204: Lines 206-215 refactored with the following changes:
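One extra benefit of the last change: the old loop variable iter shadowed the builtin iter() function; _ avoids that.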
66 changes: 28 additions & 38 deletions sonar_clf_rf.py
@@ -5,14 +5,10 @@

 def load_csv(filename):
     """This method loads a csv file"""
-    dataset = list()
+    dataset = []
     with open(filename, 'r') as file:
         csv_reader = reader(file)
-        for row in csv_reader:
-            if not row:
-                continue
-            dataset.append(row)
-
+        dataset.extend(row for row in csv_reader if row)
     return dataset


Comment on lines -8 to +11: Function load_csv refactored with the following changes:
@@ -26,10 +22,7 @@ def str_columm_to_int(dataset, column):
     """This method converts a string column to int"""
     class_values = [row[column] for row in dataset]
     unique = set(class_values)
-    lookup = dict()
-
-    for i, value in enumerate(unique):
-        lookup[value] = i
+    lookup = {value: i for i, value in enumerate(unique)}

     for row in dataset:
         row[column] = lookup[row[column]]

Comment on lines -29 to +25: Function str_columm_to_int refactored with the following changes:
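Worth noting (true of the old loop as well as the new comprehension): unique is a set, so the integer code assigned to each class label depends on set iteration order, which varies between runs for strings under hash randomization; enumerate(sorted(unique)) would make the encoding reproducible.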
@@ -39,12 +32,12 @@ def str_columm_to_int(dataset, column):

 def cross_validation_split(dataset, k_folds):
     """This method splits a dataset into k folds"""
-    dataset_split = list()
+    dataset_split = []
     dataset_copy = list(dataset)
     fold_size = int(len(dataset) / k_folds)

-    for i in range(k_folds):
-        fold = list()
+    for _ in range(k_folds):
+        fold = []
         while(len(fold) < fold_size):
             index = randrange(len(dataset_copy))
             fold.append(dataset_copy.pop(index))

Comment on lines -42 to +40: Function cross_validation_split refactored with the following changes:
@@ -55,25 +48,21 @@ def cross_validation_split(dataset, k_folds):

 def accuracy_score(actual, predicted):
     """This method predicts the accuracy percentage"""
-    correct = 0
-    for i in range(len(actual)):
-        if actual[i] == predicted[i]:
-            correct += 1
-
+    correct = sum(actual[i] == predicted[i] for i in range(len(actual)))
     return correct / float(len(actual)) * 100.0

Comment on lines -58 to +51: Function accuracy_score refactored with the following changes:


 def evaluate_algorithm(dataset, algorithm, k_folds, *args):
     """This method evaluates the algorithm using a cross validation split"""
     folds = cross_validation_split(dataset, k_folds)
-    scores = list()
+    scores = []

     for fold in folds:
         train_set = list(folds)
         train_set.remove(fold)
         train_set = sum(train_set, [])

-        test_set = list()
+        test_set = []

         for row in fold:
             row_copy = list(row)

Comment on lines -69 to +65: Function evaluate_algorithm refactored with the following changes:
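The accuracy_score rewrite above sums a generator of comparisons directly; this works because bool is a subclass of int, so each True counts as 1. For instance:

    assert sum(a == p for a, p in zip([0, 1, 1, 0], [0, 1, 0, 0])) == 3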
@@ -105,7 +94,7 @@ def test_split(index, value, dataset):
 def gini_index(groups, classes):
     """This method calculates the gini index for a split dataset"""
     # count all samples at split point
-    n_instances = float(sum([len(group) for group in groups]))
+    n_instances = float(sum(len(group) for group in groups))
     # sum weighted gini index for each group
     gini = 0.0
     for group in groups:

Comment on lines -108 to +97: Function gini_index refactored with the following changes:

def get_split(dataset, n_features):
"""This method selects the best split for the dataset"""
class_values = list(set(row[-1] for row in dataset))
class_values = list({row[-1] for row in dataset})
b_index, b_value, b_score, b_groups = 999, 999, 999, None
features = list()
features = []
Comment on lines -129 to +120
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function get_split refactored with the following changes:


while len(features) < n_features :
index = randrange(len(dataset[0]) - 1)
@@ -189,21 +178,22 @@ def build_tree(train, max_depth, min_size, n_features):

 def predict(node, row):
     """This method makes a prediction with a decision tree"""
-    if row[node['index']] < node['value']:
-        if isinstance(node['left'], dict):
-            return predict(node['left'], row)
-        else:
-            return node['left']
-    else:
-        if isinstance(node['right'], dict):
-            return predict(node['right'], row)
-        else:
-            return node['right']
+    if row[node['index']] >= node['value']:
+        return (
+            predict(node['right'], row)
+            if isinstance(node['right'], dict)
+            else node['right']
+        )
+
+    if isinstance(node['left'], dict):
+        return predict(node['left'], row)
+    else:
+        return node['left']

Comment on lines -192 to +191: Function predict refactored with the following changes:


 def subsample(dataset, ratio):
     """This method creates a random subsample from the dataset with replacement"""
-    sample = list()
+    sample = []
     n_sample = round(len(dataset) * ratio)
     while len(sample) < n_sample:
         index = randrange(len(dataset))

Comment on lines -206 to +196: Function subsample refactored with the following changes:
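The inverted guard in predict is equivalent to the original nested if/else. A tiny smoke test using a one-split stump (illustrative values):

    stump = {'index': 0, 'value': 5.0, 'left': 'rock', 'right': 'mine'}
    assert predict(stump, [3.2]) == 'rock'   # 3.2 < 5.0  -> left branch
    assert predict(stump, [7.8]) == 'mine'   # 7.8 >= 5.0 -> right branch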
@@ -219,22 +209,22 @@ def bagging_predict(trees, row):

 def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features):
     """Random Forest Algorithm"""
-    trees = list()
-    for i in range(n_trees):
+    trees = []
+    for _ in range(n_trees):
         sample = subsample(train, sample_size)
         tree = build_tree(sample, max_depth, min_size, n_features)
         trees.append(tree)
-    predictions = [bagging_predict(trees, row) for row in test]
-    return predictions
+    return [bagging_predict(trees, row) for row in test]

Comment on lines -222 to +217: Function random_forest refactored with the following changes:


 """Test run the algorithm"""

 seed(2)
 # load and prepare the data
 filename = "/home/amogh/PycharmProjects/deeplearning/indie_projects/sonar_data.csv"
 dataset = load_csv(filename)
 # convert string attributes to integers
-for i in range(0, len(dataset[0]) - 1):
+for i in range(len(dataset[0]) - 1):
     str_column_to_float(dataset, i)
 # convert class columns to integers
 str_columm_to_int(dataset, len(dataset[0]) - 1)

Comment on lines +221 to +227: Lines 237-237 refactored with the following changes:
5 changes: 1 addition & 4 deletions working_with_data/utils.py
@@ -145,10 +145,7 @@ def parse_row(input_row, parsers):
 def try_parse_field(field_name, value, parser_dict):
     """try to parse value using the appropriate function from parser_dict"""
     parser = parser_dict.get(field_name)  # None if no such entry
-    if parser is not None:
-        return try_or_none(parser)(value)
-    else:
-        return value
+    return try_or_none(parser)(value) if parser is not None else value


 def parse_dict(input_dict, parser_dict):

Function try_parse_field refactored with the following changes:
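The ternary keeps the fallthrough: fields without a registered parser come back unchanged. A quick illustration, assuming try_or_none from this file (it wraps a parser to return None on failure):

    import datetime
    parser_dict = {'date': lambda s: datetime.datetime.strptime(s, '%m/%d/%Y').date()}
    assert try_parse_field('date', '6/20/2014', parser_dict) == datetime.date(2014, 6, 20)
    assert try_parse_field('symbol', 'AAPL', parser_dict) == 'AAPL'  # no parser -> raw value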